From 58ff2981667262f77d57219fc9cef2a43a740159 Mon Sep 17 00:00:00 2001
From: Michael Lambert <mlambert@quicinc.com>
Date: Wed, 9 Feb 2022 18:15:45 -0800
Subject: Hexagon (target/hexagon) fix bug in circular addressing

Versions V3 and earlier should treat the "K_const" and "length" values
as unsigned.

Modified circ_test_v3() in tests/tcg/hexagon/circ.c to reproduce the bug

Signed-off-by: Michael Lambert <mlambert@quicinc.com>
Signed-off-by: Taylor Simpson <tsimpson@quicinc.com>
Message-Id: <20220210021556.9217-2-tsimpson@quicinc.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
---
 tests/tcg/hexagon/circ.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'tests')

diff --git a/tests/tcg/hexagon/circ.c b/tests/tcg/hexagon/circ.c
index 67a1aa3..354416e 100644
--- a/tests/tcg/hexagon/circ.c
+++ b/tests/tcg/hexagon/circ.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright(c) 2019-2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *  Copyright(c) 2019-2022 Qualcomm Innovation Center, Inc. All Rights Reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -415,7 +415,8 @@ static void circ_test_v3(void)
 {
     int *p = wbuf;
     int size = 15;
-    int K = 4;      /* 64 bytes */
+    /* set high bit in K to test unsigned extract in fcirc */
+    int K = 8;      /* 1024 bytes */
     int element;
     int i;
 
-- 
cgit v1.1


From 5b0043c67ccd7b88e0858204e79b09448adf4b34 Mon Sep 17 00:00:00 2001
From: Taylor Simpson <tsimpson@quicinc.com>
Date: Wed, 9 Feb 2022 18:15:46 -0800
Subject: Hexagon HVX (target/hexagon) fix bug in HVX saturate instructions

Two tests added to tests/tcg/hexagon/hvx_misc.c
    v21.uw = vadd(v11.uw, v10.uw):sat
    v25:24.uw = vsub(v17:16.uw, v27:26.uw):sat

Signed-off-by: Taylor Simpson <tsimpson@quicinc.com>
Message-Id: <20220210021556.9217-3-tsimpson@quicinc.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
---
 tests/tcg/hexagon/hvx_misc.c | 71 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 70 insertions(+), 1 deletion(-)

(limited to 'tests')

diff --git a/tests/tcg/hexagon/hvx_misc.c b/tests/tcg/hexagon/hvx_misc.c
index 312bb98..b896f58 100644
--- a/tests/tcg/hexagon/hvx_misc.c
+++ b/tests/tcg/hexagon/hvx_misc.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright(c) 2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *  Copyright(c) 2021-2022 Qualcomm Innovation Center, Inc. All Rights Reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -19,6 +19,7 @@
 #include <stdint.h>
 #include <stdbool.h>
 #include <string.h>
+#include <limits.h>
 
 int err;
 
@@ -432,6 +433,71 @@ TEST_PRED_OP2(pred_and, and, &, "")
 TEST_PRED_OP2(pred_and_n, and, &, "!")
 TEST_PRED_OP2(pred_xor, xor, ^, "")
 
+static void test_vadduwsat(void)
+{
+    /*
+     * Test for saturation by adding two numbers that add to more than UINT_MAX
+     * and make sure the result saturates to UINT_MAX
+     */
+    const uint32_t x = 0xffff0000;
+    const uint32_t y = 0x000fffff;
+
+    memset(expect, 0x12, sizeof(MMVector));
+    memset(output, 0x34, sizeof(MMVector));
+
+    asm volatile ("v10 = vsplat(%0)\n\t"
+                  "v11 = vsplat(%1)\n\t"
+                  "v21.uw = vadd(v11.uw, v10.uw):sat\n\t"
+                  "vmem(%2+#0) = v21\n\t"
+                  : /* no outputs */
+                  : "r"(x), "r"(y), "r"(output)
+                  : "v10", "v11", "v21", "memory");
+
+    for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
+        expect[0].uw[j] = UINT_MAX;
+    }
+
+    check_output_w(__LINE__, 1);
+}
+
+static void test_vsubuwsat_dv(void)
+{
+    /*
+     * Test for saturation by subtracting two numbers where the result is
+     * negative and make sure the result saturates to zero
+     *
+     * vsubuwsat_dv operates on an HVX register pair, so we'll have a
+     * pair of subtractions
+     *     w - x < 0
+     *     y - z < 0
+     */
+    const uint32_t w = 0x000000b7;
+    const uint32_t x = 0xffffff4e;
+    const uint32_t y = 0x31fe88e7;
+    const uint32_t z = 0x7fffff79;
+
+    memset(expect, 0x12, sizeof(MMVector) * 2);
+    memset(output, 0x34, sizeof(MMVector) * 2);
+
+    asm volatile ("v16 = vsplat(%0)\n\t"
+                  "v17 = vsplat(%1)\n\t"
+                  "v26 = vsplat(%2)\n\t"
+                  "v27 = vsplat(%3)\n\t"
+                  "v25:24.uw = vsub(v17:16.uw, v27:26.uw):sat\n\t"
+                  "vmem(%4+#0) = v24\n\t"
+                  "vmem(%4+#1) = v25\n\t"
+                  : /* no outputs */
+                  : "r"(w), "r"(y), "r"(x), "r"(z), "r"(output)
+                  : "v16", "v17", "v24", "v25", "v26", "v27", "memory");
+
+    for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
+        expect[0].uw[j] = 0x00000000;
+        expect[1].uw[j] = 0x00000000;
+    }
+
+    check_output_w(__LINE__, 2);
+}
+
 int main()
 {
     init_buffers();
@@ -464,6 +530,9 @@ int main()
     test_pred_and_n(true);
     test_pred_xor(false);
 
+    test_vadduwsat();
+    test_vsubuwsat_dv();
+
     puts(err ? "FAIL" : "PASS");
     return err ? 1 : 0;
 }
-- 
cgit v1.1


From 77ccf44453a83e17cc830df700cc072f6bcf6a71 Mon Sep 17 00:00:00 2001
From: Taylor Simpson <tsimpson@quicinc.com>
Date: Wed, 9 Feb 2022 18:15:49 -0800
Subject: Hexagon (target/hexagon) properly handle denorm in
 arch_sf_recip_common

The arch_sf_recip_common function was calling float32_getexp, which
adjusts for denorm, but we actually need the raw exponent bits.

This function is called from 3 instructions
    sfrecipa
    sffixupn
    sffixupd

Test cases added to tests/tcg/hexagon/fpstuff.c

Signed-off-by: Taylor Simpson <tsimpson@quicinc.com>
Message-Id: <20220210021556.9217-6-tsimpson@quicinc.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
---
 tests/tcg/hexagon/fpstuff.c | 44 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 41 insertions(+), 3 deletions(-)

(limited to 'tests')

diff --git a/tests/tcg/hexagon/fpstuff.c b/tests/tcg/hexagon/fpstuff.c
index 0dff429..043f18f 100644
--- a/tests/tcg/hexagon/fpstuff.c
+++ b/tests/tcg/hexagon/fpstuff.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright(c) 2020-2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *  Copyright(c) 2020-2022 Qualcomm Innovation Center, Inc. All Rights Reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -38,6 +38,8 @@ const int SF_NaN_special =                0x7f800001;
 const int SF_ANY =                        0x3f800000;
 const int SF_HEX_NAN =                    0xffffffff;
 const int SF_small_neg =                  0xab98fba8;
+const int SF_denorm =                     0x00000001;
+const int SF_random =                     0x346001d6;
 
 const long long DF_NaN =                  0x7ff8000000000000ULL;
 const long long DF_ANY =                  0x3f80000000000000ULL;
@@ -250,10 +252,11 @@ static void check_dfminmax(void)
     check_fpstatus(usr, FPINVF);
 }
 
-static void check_recip_exception(void)
+static void check_sfrecipa(void)
 {
     int result;
     int usr;
+    int pred;
 
     /*
      * Check that sfrecipa doesn't set status bits when
@@ -329,6 +332,17 @@ static void check_recip_exception(void)
          : "r2", "p0", "usr");
     check32(result, 0x3f800000);
     check_fpstatus(usr, 0);
+
+    /*
+     * Check that sfrecipa properly handles denorm
+     */
+    asm (CLEAR_FPSTATUS
+         "%0,p0 = sfrecipa(%2, %3)\n\t"
+         "%1 = p0\n\t"
+         : "=r"(result), "=r"(pred) : "r"(SF_denorm), "r"(SF_random)
+         : "p0", "usr");
+    check32(result, 0x6a920001);
+    check32(pred, 0x80);
 }
 
 static void check_canonical_NaN(void)
@@ -455,6 +469,28 @@ static void check_invsqrta(void)
     check32(predval, 0x0);
 }
 
+static void check_sffixupn(void)
+{
+    int result;
+
+    /* Check that sffixupn properly deals with denorm */
+    asm volatile("%0 = sffixupn(%1, %2)\n\t"
+                 : "=r"(result)
+                 : "r"(SF_random), "r"(SF_denorm));
+    check32(result, 0x246001d6);
+}
+
+static void check_sffixupd(void)
+{
+    int result;
+
+    /* Check that sffixupd properly deals with denorm */
+    asm volatile("%0 = sffixupd(%1, %2)\n\t"
+                 : "=r"(result)
+                 : "r"(SF_denorm), "r"(SF_random));
+    check32(result, 0x146001d6);
+}
+
 static void check_float2int_convs()
 {
     int res32;
@@ -602,9 +638,11 @@ int main()
     check_compare_exception();
     check_sfminmax();
     check_dfminmax();
-    check_recip_exception();
+    check_sfrecipa();
     check_canonical_NaN();
     check_invsqrta();
+    check_sffixupn();
+    check_sffixupd();
     check_float2int_convs();
 
     puts(err ? "FAIL" : "PASS");
-- 
cgit v1.1


From d76dd816bf328a66ce57b2fb27d046656d3ab411 Mon Sep 17 00:00:00 2001
From: Taylor Simpson <tsimpson@quicinc.com>
Date: Tue, 15 Feb 2022 20:39:39 -0800
Subject: Hexagon (target/hexagon) properly handle NaN in
 dfmin/dfmax/sfmin/sfmax

The float??_minnum implementation differs from Hexagon for SNaN:
float??_minnum returns NaN, whereas Hexagon returns the other input.
So, we use float??_minimum_number instead.

Test cases added to tests/tcg/hexagon/fpstuff.c

Signed-off-by: Taylor Simpson <tsimpson@quicinc.com>
Message-Id: <20220308190410.22355-1-tsimpson@quicinc.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
---
 tests/tcg/hexagon/fpstuff.c | 79 +++++++++++++++++++++++++++++++++++----------
 1 file changed, 62 insertions(+), 17 deletions(-)

(limited to 'tests')

diff --git a/tests/tcg/hexagon/fpstuff.c b/tests/tcg/hexagon/fpstuff.c
index 043f18f..56bf562 100644
--- a/tests/tcg/hexagon/fpstuff.c
+++ b/tests/tcg/hexagon/fpstuff.c
@@ -41,7 +41,8 @@ const int SF_small_neg =                  0xab98fba8;
 const int SF_denorm =                     0x00000001;
 const int SF_random =                     0x346001d6;
 
-const long long DF_NaN =                  0x7ff8000000000000ULL;
+const long long DF_QNaN =                 0x7ff8000000000000ULL;
+const long long DF_SNaN =                 0x7ff7000000000000ULL;
 const long long DF_ANY =                  0x3f80000000000000ULL;
 const long long DF_HEX_NAN =              0xffffffffffffffffULL;
 const long long DF_small_neg =            0xbd731f7500000000ULL;
@@ -128,7 +129,7 @@ static void check_compare_exception(void)
          "p0 = dfcmp.eq(%2, %3)\n\t"
          "%0 = p0\n\t"
          "%1 = usr\n\t"
-         : "=r"(cmp), "=r"(usr) : "r"(DF_NaN), "r"(DF_ANY)
+         : "=r"(cmp), "=r"(usr) : "r"(DF_QNaN), "r"(DF_ANY)
          : "r2", "p0", "usr");
     check32(cmp, 0);
     check_fpstatus(usr, 0);
@@ -137,7 +138,7 @@ static void check_compare_exception(void)
          "p0 = dfcmp.gt(%2, %3)\n\t"
          "%0 = p0\n\t"
          "%1 = usr\n\t"
-         : "=r"(cmp), "=r"(usr) : "r"(DF_NaN), "r"(DF_ANY)
+         : "=r"(cmp), "=r"(usr) : "r"(DF_QNaN), "r"(DF_ANY)
          : "r2", "p0", "usr");
     check32(cmp, 0);
     check_fpstatus(usr, 0);
@@ -146,7 +147,7 @@ static void check_compare_exception(void)
          "p0 = dfcmp.ge(%2, %3)\n\t"
          "%0 = p0\n\t"
          "%1 = usr\n\t"
-         : "=r"(cmp), "=r"(usr) : "r"(DF_NaN), "r"(DF_ANY)
+         : "=r"(cmp), "=r"(usr) : "r"(DF_QNaN), "r"(DF_ANY)
          : "r2", "p0", "usr");
     check32(cmp, 0);
     check_fpstatus(usr, 0);
@@ -208,7 +209,7 @@ static void check_dfminmax(void)
     int usr;
 
     /*
-     * Execute dfmin/dfmax instructions with one operand as NaN
+     * Execute dfmin/dfmax instructions with one operand as SNaN
      * Check that
      *     Result is the other operand
      *     Invalid bit in USR is set
@@ -216,7 +217,7 @@ static void check_dfminmax(void)
      asm (CLEAR_FPSTATUS
          "%0 = dfmin(%2, %3)\n\t"
          "%1 = usr\n\t"
-         : "=r"(minmax), "=r"(usr) : "r"(DF_NaN), "r"(DF_ANY)
+         : "=r"(minmax), "=r"(usr) : "r"(DF_SNaN), "r"(DF_ANY)
          : "r2", "usr");
     check64(minmax, DF_ANY);
     check_fpstatus(usr, FPINVF);
@@ -224,13 +225,35 @@ static void check_dfminmax(void)
     asm (CLEAR_FPSTATUS
          "%0 = dfmax(%2, %3)\n\t"
          "%1 = usr\n\t"
-         : "=r"(minmax), "=r"(usr) : "r"(DF_NaN), "r"(DF_ANY)
+         : "=r"(minmax), "=r"(usr) : "r"(DF_SNaN), "r"(DF_ANY)
          : "r2", "usr");
     check64(minmax, DF_ANY);
     check_fpstatus(usr, FPINVF);
 
     /*
-     * Execute dfmin/dfmax instructions with both operands NaN
+     * Execute dfmin/dfmax instructions with one operand as QNaN
+     * Check that
+     *     Result is the other operand
+     *     No bit in USR is set
+     */
+     asm (CLEAR_FPSTATUS
+         "%0 = dfmin(%2, %3)\n\t"
+         "%1 = usr\n\t"
+         : "=r"(minmax), "=r"(usr) : "r"(DF_QNaN), "r"(DF_ANY)
+         : "r2", "usr");
+    check64(minmax, DF_ANY);
+    check_fpstatus(usr, 0);
+
+    asm (CLEAR_FPSTATUS
+         "%0 = dfmax(%2, %3)\n\t"
+         "%1 = usr\n\t"
+         : "=r"(minmax), "=r"(usr) : "r"(DF_QNaN), "r"(DF_ANY)
+         : "r2", "usr");
+    check64(minmax, DF_ANY);
+    check_fpstatus(usr, 0);
+
+    /*
+     * Execute dfmin/dfmax instructions with both operands SNaN
      * Check that
      *     Result is DF_HEX_NAN
      *     Invalid bit in USR is set
@@ -238,7 +261,7 @@ static void check_dfminmax(void)
     asm (CLEAR_FPSTATUS
          "%0 = dfmin(%2, %3)\n\t"
          "%1 = usr\n\t"
-         : "=r"(minmax), "=r"(usr) : "r"(DF_NaN), "r"(DF_NaN)
+         : "=r"(minmax), "=r"(usr) : "r"(DF_SNaN), "r"(DF_SNaN)
          : "r2", "usr");
     check64(minmax, DF_HEX_NAN);
     check_fpstatus(usr, FPINVF);
@@ -246,10 +269,32 @@ static void check_dfminmax(void)
     asm (CLEAR_FPSTATUS
          "%0 = dfmax(%2, %3)\n\t"
          "%1 = usr\n\t"
-         : "=r"(minmax), "=r"(usr) : "r"(DF_NaN), "r"(DF_NaN)
+         : "=r"(minmax), "=r"(usr) : "r"(DF_SNaN), "r"(DF_SNaN)
          : "r2", "usr");
     check64(minmax, DF_HEX_NAN);
     check_fpstatus(usr, FPINVF);
+
+    /*
+     * Execute dfmin/dfmax instructions with both operands QNaN
+     * Check that
+     *     Result is DF_HEX_NAN
+     *     No bit in USR is set
+     */
+    asm (CLEAR_FPSTATUS
+         "%0 = dfmin(%2, %3)\n\t"
+         "%1 = usr\n\t"
+         : "=r"(minmax), "=r"(usr) : "r"(DF_QNaN), "r"(DF_QNaN)
+         : "r2", "usr");
+    check64(minmax, DF_HEX_NAN);
+    check_fpstatus(usr, 0);
+
+    asm (CLEAR_FPSTATUS
+         "%0 = dfmax(%2, %3)\n\t"
+         "%1 = usr\n\t"
+         : "=r"(minmax), "=r"(usr) : "r"(DF_QNaN), "r"(DF_QNaN)
+         : "r2", "usr");
+    check64(minmax, DF_HEX_NAN);
+    check_fpstatus(usr, 0);
 }
 
 static void check_sfrecipa(void)
@@ -425,7 +470,7 @@ static void check_canonical_NaN(void)
     asm(CLEAR_FPSTATUS
         "%0 = convert_df2sf(%2)\n\t"
         "%1 = usr\n\t"
-        : "=r"(sf_result), "=r"(usr) : "r"(DF_NaN)
+        : "=r"(sf_result), "=r"(usr) : "r"(DF_QNaN)
         : "r2", "usr");
     check32(sf_result, SF_HEX_NAN);
     check_fpstatus(usr, 0);
@@ -433,7 +478,7 @@ static void check_canonical_NaN(void)
     asm(CLEAR_FPSTATUS
         "%0 = dfadd(%2, %3)\n\t"
         "%1 = usr\n\t"
-        : "=r"(df_result), "=r"(usr) : "r"(DF_NaN), "r"(DF_ANY)
+        : "=r"(df_result), "=r"(usr) : "r"(DF_QNaN), "r"(DF_ANY)
         : "r2", "usr");
     check64(df_result, DF_HEX_NAN);
     check_fpstatus(usr, 0);
@@ -441,7 +486,7 @@ static void check_canonical_NaN(void)
     asm(CLEAR_FPSTATUS
         "%0 = dfsub(%2, %3)\n\t"
         "%1 = usr\n\t"
-        : "=r"(df_result), "=r"(usr) : "r"(DF_NaN), "r"(DF_ANY)
+        : "=r"(df_result), "=r"(usr) : "r"(DF_QNaN), "r"(DF_ANY)
         : "r2", "usr");
     check64(df_result, DF_HEX_NAN);
     check_fpstatus(usr, 0);
@@ -603,7 +648,7 @@ static void check_float2int_convs()
     asm(CLEAR_FPSTATUS
         "%0 = convert_df2w(%2)\n\t"
         "%1 = usr\n\t"
-        : "=r"(res32), "=r"(usr) : "r"(DF_NaN)
+        : "=r"(res32), "=r"(usr) : "r"(DF_QNaN)
         : "r2", "usr");
     check32(res32, -1);
     check_fpstatus(usr, FPINVF);
@@ -611,7 +656,7 @@ static void check_float2int_convs()
     asm(CLEAR_FPSTATUS
         "%0 = convert_df2w(%2):chop\n\t"
         "%1 = usr\n\t"
-        : "=r"(res32), "=r"(usr) : "r"(DF_NaN)
+        : "=r"(res32), "=r"(usr) : "r"(DF_QNaN)
         : "r2", "usr");
     check32(res32, -1);
     check_fpstatus(usr, FPINVF);
@@ -619,7 +664,7 @@ static void check_float2int_convs()
     asm(CLEAR_FPSTATUS
         "%0 = convert_df2d(%2)\n\t"
         "%1 = usr\n\t"
-        : "=r"(res64), "=r"(usr) : "r"(DF_NaN)
+        : "=r"(res64), "=r"(usr) : "r"(DF_QNaN)
         : "r2", "usr");
     check64(res64, -1);
     check_fpstatus(usr, FPINVF);
@@ -627,7 +672,7 @@ static void check_float2int_convs()
     asm(CLEAR_FPSTATUS
         "%0 = convert_df2d(%2):chop\n\t"
         "%1 = usr\n\t"
-        : "=r"(res64), "=r"(usr) : "r"(DF_NaN)
+        : "=r"(res64), "=r"(usr) : "r"(DF_QNaN)
         : "r2", "usr");
     check64(res64, -1);
     check_fpstatus(usr, FPINVF);
-- 
cgit v1.1


From 4d04395a1716c669cf634a90e768c1baa0e68aff Mon Sep 17 00:00:00 2001
From: Taylor Simpson <tsimpson@quicinc.com>
Date: Wed, 9 Feb 2022 18:15:50 -0800
Subject: Hexagon (tests/tcg/hexagon) test instructions that might set bits in
 USR

Hexagon has ~200 instructions that set the saturate bit in USR.  These
were broken into groups of similar instructions, and one instruction
from each group is tested with at least one input that does not
saturate and at least one input that does saturate.

Signed-off-by: Taylor Simpson <tsimpson@quicinc.com>
Message-Id: <20220210021556.9217-7-tsimpson@quicinc.com>
Acked-by: Richard Henderson <richard.henderson@linaro.org>
---
 tests/tcg/hexagon/Makefile.target |   8 +-
 tests/tcg/hexagon/usr.c           | 798 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 805 insertions(+), 1 deletion(-)
 create mode 100644 tests/tcg/hexagon/usr.c

(limited to 'tests')

diff --git a/tests/tcg/hexagon/Makefile.target b/tests/tcg/hexagon/Makefile.target
index 8b07a28..23b9870 100644
--- a/tests/tcg/hexagon/Makefile.target
+++ b/tests/tcg/hexagon/Makefile.target
@@ -1,5 +1,5 @@
 ##
-##  Copyright(c) 2019-2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
+##  Copyright(c) 2019-2022 Qualcomm Innovation Center, Inc. All Rights Reserved.
 ##
 ##  This program is free software; you can redistribute it and/or modify
 ##  it under the terms of the GNU General Public License as published by
@@ -30,6 +30,7 @@ first: $(HEX_SRC)/first.S
 HEX_TESTS = first
 HEX_TESTS += hex_sigsegv
 HEX_TESTS += misc
+HEX_TESTS += usr
 HEX_TESTS += preg_alias
 HEX_TESTS += dual_stores
 HEX_TESTS += multi_result
@@ -43,3 +44,8 @@ HEX_TESTS += fpstuff
 HEX_TESTS += overflow
 
 TESTS += $(HEX_TESTS)
+
+# This test has to be compiled for the -mv67t target
+usr: usr.c
+	$(CC) $(CFLAGS) -mv67t -O2 -Wno-inline-asm -Wno-expansion-to-defined $< -o $@ $(LDFLAGS)
+
diff --git a/tests/tcg/hexagon/usr.c b/tests/tcg/hexagon/usr.c
new file mode 100644
index 0000000..e827272
--- /dev/null
+++ b/tests/tcg/hexagon/usr.c
@@ -0,0 +1,798 @@
+/*
+ *  Copyright(c) 2022 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Test instructions that might set bits in user status register (USR)
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+
+int err;
+
+static void __check(int line, uint32_t val, uint32_t expect)
+{
+    if (val != expect) { /* report mismatch in decimal, count the failure */
+        printf("ERROR at line %d: %u != %u\n", line, val, expect);
+        err++;
+    }
+}
+
+#define check(RES, EXP) __check(__LINE__, RES, EXP)
+
+static void __check32(int line, uint32_t val, uint32_t expect)
+{
+    if (val != expect) {
+        printf("ERROR at line %d: 0x%08x != 0x%08x\n", line, val, expect);
+        err++;
+    }
+}
+
+#define check32(RES, EXP) __check32(__LINE__, RES, EXP)
+
+static void __check64(int line, uint64_t val, uint64_t expect)
+{
+    if (val != expect) {
+        printf("ERROR at line %d: 0x%016llx != 0x%016llx\n", line, val, expect);
+        err++;
+    }
+}
+
+#define check64(RES, EXP) __check64(__LINE__, RES, EXP)
+
+/*
+ * Some of the instructions tested are only available on certain versions
+ * of the Hexagon core
+ */
+#define CORE_HAS_AUDIO    (__HEXAGON_ARCH__ >= 67 && defined(__HEXAGON_AUDIO__))
+#define CORE_IS_V67       (__HEXAGON_ARCH__ >= 67)
+
+/* Define the bits in Hexagon USR register */
+#define USR_OVF_BIT          0        /* Sticky saturation overflow */
+#define USR_FPINVF_BIT       1        /* IEEE FP invalid sticky flag */
+#define USR_FPDBZF_BIT       2        /* IEEE FP divide-by-zero sticky flag */
+#define USR_FPOVFF_BIT       3        /* IEEE FP overflow sticky flag */
+#define USR_FPUNFF_BIT       4        /* IEEE FP underflow sticky flag */
+#define USR_FPINPF_BIT       5        /* IEEE FP inexact sticky flag */
+
+/* Corresponding values in USR */
+#define USR_CLEAR            0
+#define USR_OVF              (1 << USR_OVF_BIT)
+#define USR_FPINVF           (1 << USR_FPINVF_BIT)
+#define USR_FPDBZF           (1 << USR_FPDBZF_BIT)
+#define USR_FPOVFF           (1 << USR_FPOVFF_BIT)
+#define USR_FPUNFF           (1 << USR_FPUNFF_BIT)
+#define USR_FPINPF           (1 << USR_FPINPF_BIT)
+
+/*
+ * Templates for functions to execute an instruction
+ *
+ * The templates vary by the number of arguments and the types of the args
+ * and result.  We use one letter in the macro name for the result and each
+ * argument:
+ *     x             unknown (specified in a subsequent template) or don't care
+ *     R             register (32 bits)
+ *     P             pair (64 bits)
+ *     p             predicate
+ *     I             immediate
+ *     Xx            read/write
+ */
+
+/* Clear bits 0-5 in USR */
+#define CLEAR_USRBITS \
+    "r2 = usr\n\t" \
+    "r2 = and(r2, #0xffffffc0)\n\t" \
+    "usr = r2\n\t"
+
+/* Template for instructions with one register operand */
+#define FUNC_x_OP_x(RESTYPE, SRCTYPE, NAME, INSN) \
+static RESTYPE NAME(SRCTYPE src, uint32_t *usr_result) \
+{ \
+    RESTYPE result; \
+    uint32_t usr; \
+    asm(CLEAR_USRBITS \
+        INSN  "\n\t" \
+        "%1 = usr\n\t" \
+        : "=r"(result), "=r"(usr) \
+        : "r"(src) \
+        : "r2", "usr"); \
+      *usr_result = usr & 0x3f; \
+      return result; \
+}
+
+#define FUNC_R_OP_R(NAME, INSN) \
+FUNC_x_OP_x(uint32_t, uint32_t, NAME, INSN)
+
+#define FUNC_R_OP_P(NAME, INSN) \
+FUNC_x_OP_x(uint32_t, uint64_t, NAME, INSN)
+
+#define FUNC_P_OP_P(NAME, INSN) \
+FUNC_x_OP_x(uint64_t, uint64_t, NAME, INSN)
+
+#define FUNC_P_OP_R(NAME, INSN) \
+FUNC_x_OP_x(uint64_t, uint32_t, NAME, INSN)
+
+/*
+ * Template for instructions with a register and predicate result
+ * and one register operand
+ */
+#define FUNC_xp_OP_x(RESTYPE, SRCTYPE, NAME, INSN) \
+static RESTYPE NAME(SRCTYPE src, uint8_t *pred_result, uint32_t *usr_result) \
+{ \
+    RESTYPE result; \
+    uint8_t pred; \
+    uint32_t usr; \
+    asm(CLEAR_USRBITS \
+        INSN  "\n\t" \
+        "%1 = p2\n\t" \
+        "%2 = usr\n\t" \
+        : "=r"(result), "=r"(pred), "=r"(usr) \
+        : "r"(src) \
+        : "r2", "p2", "usr"); \
+    *pred_result = pred; \
+    *usr_result = usr & 0x3f; \
+    return result; \
+}
+
+#define FUNC_Rp_OP_R(NAME, INSN) \
+FUNC_xp_OP_x(uint32_t, uint32_t, NAME, INSN)
+
+/* Template for instructions with two register operands */
+#define FUNC_x_OP_xx(RESTYPE, SRC1TYPE, SRC2TYPE, NAME, INSN) \
+static RESTYPE NAME(SRC1TYPE src1, SRC2TYPE src2, uint32_t *usr_result) \
+{ \
+    RESTYPE result; \
+    uint32_t usr; \
+    asm(CLEAR_USRBITS \
+        INSN "\n\t" \
+        "%1 = usr\n\t" \
+        : "=r"(result), "=r"(usr) \
+        : "r"(src1), "r"(src2) \
+        : "r2", "usr"); \
+    *usr_result = usr & 0x3f; \
+    return result; \
+}
+
+#define FUNC_P_OP_PP(NAME, INSN) \
+FUNC_x_OP_xx(uint64_t, uint64_t, uint64_t, NAME, INSN)
+
+#define FUNC_R_OP_PP(NAME, INSN) \
+FUNC_x_OP_xx(uint32_t, uint64_t, uint64_t, NAME, INSN)
+
+#define FUNC_P_OP_RR(NAME, INSN) \
+FUNC_x_OP_xx(uint64_t, uint32_t, uint32_t, NAME, INSN)
+
+#define FUNC_R_OP_RR(NAME, INSN) \
+FUNC_x_OP_xx(uint32_t, uint32_t, uint32_t, NAME, INSN)
+
+#define FUNC_R_OP_PR(NAME, INSN) \
+FUNC_x_OP_xx(uint32_t, uint64_t, uint32_t, NAME, INSN)
+
+#define FUNC_P_OP_PR(NAME, INSN) \
+FUNC_x_OP_xx(uint64_t, uint64_t, uint32_t, NAME, INSN)
+
+/*
+ * Template for instructions with a register and predicate result
+ * and two register operands
+ */
+#define FUNC_xp_OP_xx(RESTYPE, SRC1TYPE, SRC2TYPE, NAME, INSN) \
+static RESTYPE NAME(SRC1TYPE src1, SRC2TYPE src2, \
+                    uint8_t *pred_result, uint32_t *usr_result) \
+{ \
+    RESTYPE result; \
+    uint8_t pred; \
+    uint32_t usr; \
+    asm(CLEAR_USRBITS \
+        INSN  "\n\t" \
+        "%1 = p2\n\t" \
+        "%2 = usr\n\t" \
+        : "=r"(result), "=r"(pred), "=r"(usr) \
+        : "r"(src1), "r"(src2) \
+        : "r2", "p2", "usr"); \
+    *pred_result = pred; \
+    *usr_result = usr & 0x3f; \
+    return result; \
+}
+
+#define FUNC_Rp_OP_RR(NAME, INSN) \
+FUNC_xp_OP_xx(uint32_t, uint32_t, uint32_t, NAME, INSN)
+
+/* Template for instructions with one register and one immediate */
+#define FUNC_x_OP_xI(RESTYPE, SRC1TYPE, NAME, INSN) \
+static RESTYPE NAME(SRC1TYPE src1, int32_t src2, uint32_t *usr_result) \
+{ \
+    RESTYPE result; \
+    uint32_t usr; \
+    asm(CLEAR_USRBITS \
+        INSN "\n\t" \
+        "%1 = usr\n\t" \
+        : "=r"(result), "=r"(usr) \
+        : "r"(src1), "i"(src2) \
+        : "r2", "usr"); \
+    *usr_result = usr & 0x3f; \
+    return result; \
+}
+
+#define FUNC_R_OP_RI(NAME, INSN) \
+FUNC_x_OP_xI(uint32_t, uint32_t, NAME, INSN)
+
+#define FUNC_R_OP_PI(NAME, INSN) \
+FUNC_x_OP_xI(uint32_t, uint64_t, NAME, INSN)
+
+/*
+ * Template for instructions with a read/write result
+ * and two register operands
+ */
+#define FUNC_Xx_OP_xx(RESTYPE, SRC1TYPE, SRC2TYPE, NAME, INSN) \
+static RESTYPE NAME(RESTYPE result, SRC1TYPE src1, SRC2TYPE src2, \
+                    uint32_t *usr_result) \
+{ \
+    uint32_t usr; \
+    asm(CLEAR_USRBITS \
+        INSN "\n\t" \
+        "%1 = usr\n\t" \
+        : "+r"(result), "=r"(usr) \
+        : "r"(src1), "r"(src2) \
+        : "r2", "usr"); \
+    *usr_result = usr & 0x3f; \
+    return result; \
+}
+
+#define FUNC_XR_OP_RR(NAME, INSN) \
+FUNC_Xx_OP_xx(uint32_t, uint32_t, uint32_t, NAME, INSN)
+
+#define FUNC_XP_OP_PP(NAME, INSN) \
+FUNC_Xx_OP_xx(uint64_t, uint64_t, uint64_t, NAME, INSN)
+
+#define FUNC_XP_OP_RR(NAME, INSN) \
+FUNC_Xx_OP_xx(uint64_t, uint32_t, uint32_t, NAME, INSN)
+
+/*
+ * Template for instructions with a read/write result, a predicate
+ * result (read back from p2), and two register operands
+ */
+#define FUNC_Xxp_OP_xx(RESTYPE, SRC1TYPE, SRC2TYPE, NAME, INSN) \
+static RESTYPE NAME(RESTYPE result, SRC1TYPE src1, SRC2TYPE src2, \
+                    uint8_t *pred_result, uint32_t *usr_result) \
+{ \
+    uint32_t usr; \
+    uint8_t pred; \
+    asm(CLEAR_USRBITS \
+        INSN "\n\t" \
+        "%1 = p2\n\t" \
+        "%2 = usr\n\t" \
+        : "+r"(result), "=r"(pred), "=r"(usr) \
+        : "r"(src1), "r"(src2) \
+        : "r2", "p2", "usr"); \
+    *pred_result = pred; \
+    *usr_result = usr & 0x3f; \
+    return result; \
+}
+
+#define FUNC_XPp_OP_PP(NAME, INSN) \
+FUNC_Xxp_OP_xx(uint64_t, uint64_t, uint64_t, NAME, INSN)
+
+/*
+ * Template for instructions with a read/write result and
+ * two register and one predicate operands
+ */
+#define FUNC_Xx_OP_xxp(RESTYPE, SRC1TYPE, SRC2TYPE, NAME, INSN) \
+static RESTYPE NAME(RESTYPE result, SRC1TYPE src1, SRC2TYPE src2, uint8_t pred,\
+                    uint32_t *usr_result) \
+{ \
+    uint32_t usr; \
+    asm(CLEAR_USRBITS \
+        "p2 = %4\n\t" \
+        INSN "\n\t" \
+        "%1 = usr\n\t" \
+        : "+r"(result), "=r"(usr) \
+        : "r"(src1), "r"(src2), "r"(pred) \
+        : "r2", "p2", "usr"); \
+    *usr_result = usr & 0x3f; \
+    return result; \
+}
+
+#define FUNC_XR_OP_RRp(NAME, INSN) \
+FUNC_Xx_OP_xxp(uint32_t, uint32_t, uint32_t, NAME, INSN)
+
+/*
+ * Function declarations using the templates
+ */
+FUNC_R_OP_R(satub,              "%0 = satub(%2)")
+FUNC_P_OP_PP(vaddubs,           "%0 = vaddub(%2, %3):sat")
+FUNC_P_OP_PP(vadduhs,           "%0 = vadduh(%2, %3):sat")
+FUNC_P_OP_PP(vsububs,           "%0 = vsubub(%2, %3):sat")
+FUNC_P_OP_PP(vsubuhs,           "%0 = vsubuh(%2, %3):sat")
+
+/* Add vector of half integers with saturation and pack to unsigned bytes */
+FUNC_R_OP_PP(vaddhubs,          "%0 = vaddhub(%2, %3):sat")
+
+/* Vector saturate half to unsigned byte */
+FUNC_R_OP_P(vsathub,            "%0 = vsathub(%2)")
+
+/* Similar to above but takes a 32-bit argument */
+FUNC_R_OP_R(svsathub,           "%0 = vsathub(%2)")
+
+/* Vector saturate word to unsigned half */
+FUNC_P_OP_P(vsatwuh_nopack,     "%0 = vsatwuh(%2)")
+
+/* Similar to above but returns a 32-bit result */
+FUNC_R_OP_P(vsatwuh,            "%0 = vsatwuh(%2)")
+
+/* Vector arithmetic shift halfwords with saturate and pack */
+FUNC_R_OP_PI(asrhub_sat,        "%0 = vasrhub(%2, #%3):sat")
+
+/* Vector arithmetic shift halfwords with round, saturate and pack */
+FUNC_R_OP_PI(asrhub_rnd_sat,    "%0 = vasrhub(%2, #%3):raw")
+
+FUNC_R_OP_RR(addsat,            "%0 = add(%2, %3):sat")
+/* Similar to above but with register pairs */
+FUNC_P_OP_PP(addpsat,           "%0 = add(%2, %3):sat")
+
+FUNC_XR_OP_RR(mpy_acc_sat_hh_s0, "%0 += mpy(%2.H, %3.H):sat")
+FUNC_R_OP_RR(mpy_sat_hh_s1,     "%0 = mpy(%2.H, %3.H):<<1:sat")
+FUNC_R_OP_RR(mpy_sat_rnd_hh_s1, "%0 = mpy(%2.H, %3.H):<<1:rnd:sat")
+FUNC_R_OP_RR(mpy_up_s1_sat,     "%0 = mpy(%2, %3):<<1:sat")
+FUNC_P_OP_RR(vmpy2s_s1,         "%0 = vmpyh(%2, %3):<<1:sat")
+FUNC_P_OP_RR(vmpy2su_s1,        "%0 = vmpyhsu(%2, %3):<<1:sat")
+FUNC_R_OP_RR(vmpy2s_s1pack,     "%0 = vmpyh(%2, %3):<<1:rnd:sat")
+FUNC_P_OP_PP(vmpy2es_s1,        "%0 = vmpyeh(%2, %3):<<1:sat")
+FUNC_R_OP_PP(vdmpyrs_s1,        "%0 = vdmpy(%2, %3):<<1:rnd:sat")
+FUNC_XP_OP_PP(vdmacs_s0,        "%0 += vdmpy(%2, %3):sat")
+FUNC_R_OP_RR(cmpyrs_s0,         "%0 = cmpy(%2, %3):rnd:sat")
+FUNC_XP_OP_RR(cmacs_s0,         "%0 += cmpy(%2, %3):sat")
+FUNC_XP_OP_RR(cnacs_s0,         "%0 -= cmpy(%2, %3):sat")
+FUNC_P_OP_PP(vrcmpys_s1_h,      "%0 = vrcmpys(%2, %3):<<1:sat:raw:hi")
+FUNC_XP_OP_PP(mmacls_s0,        "%0 += vmpyweh(%2, %3):sat")
+FUNC_R_OP_RR(hmmpyl_rs1,        "%0 = mpy(%2, %3.L):<<1:rnd:sat")
+FUNC_XP_OP_PP(mmaculs_s0,       "%0 += vmpyweuh(%2, %3):sat")
+FUNC_R_OP_PR(cmpyi_wh,          "%0 = cmpyiwh(%2, %3):<<1:rnd:sat")
+FUNC_P_OP_PP(vcmpy_s0_sat_i,    "%0 = vcmpyi(%2, %3):sat")
+FUNC_P_OP_PR(vcrotate,          "%0 = vcrotate(%2, %3)")
+FUNC_P_OP_PR(vcnegh,            "%0 = vcnegh(%2, %3)")
+
+#if CORE_HAS_AUDIO
+FUNC_R_OP_PP(wcmpyrw,           "%0 = cmpyrw(%2, %3):<<1:sat")
+#endif
+
+FUNC_R_OP_RR(addh_l16_sat_ll,   "%0 = add(%2.L, %3.L):sat")
+FUNC_P_OP_P(vconj,              "%0 = vconj(%2):sat")
+FUNC_P_OP_PP(vxaddsubw,         "%0 = vxaddsubw(%2, %3):sat")
+FUNC_P_OP_P(vabshsat,           "%0 = vabsh(%2):sat")
+FUNC_P_OP_PP(vnavgwr,           "%0 = vnavgw(%2, %3):rnd:sat")
+FUNC_R_OP_RI(round_ri_sat,      "%0 = round(%2, #%3):sat")
+FUNC_R_OP_RR(asr_r_r_sat,       "%0 = asr(%2, %3):sat")
+
+FUNC_XPp_OP_PP(ACS,             "%0, p2 = vacsh(%3, %4)")
+
+/*
+ * Templates for test cases
+ *
+ * Same naming convention as the function templates
+ */
+#define TEST_x_OP_x(RESTYPE, CHECKFN, SRCTYPE, FUNC, SRC, RES, USR_RES) \
+    do { \
+        RESTYPE result; \
+        SRCTYPE src = SRC; \
+        uint32_t usr_result; \
+        result = FUNC(src, &usr_result); \
+        CHECKFN(result, RES); \
+        check(usr_result, USR_RES); \
+    } while (0)
+
+#define TEST_R_OP_R(FUNC, SRC, RES, USR_RES) \
+TEST_x_OP_x(uint32_t, check32, uint32_t, FUNC, SRC, RES, USR_RES)
+
+#define TEST_R_OP_P(FUNC, SRC, RES, USR_RES) \
+TEST_x_OP_x(uint32_t, check32, uint64_t, FUNC, SRC, RES, USR_RES)
+
+#define TEST_P_OP_P(FUNC, SRC, RES, USR_RES) \
+TEST_x_OP_x(uint64_t, check64, uint64_t, FUNC, SRC, RES, USR_RES)
+
+#define TEST_P_OP_R(FUNC, SRC, RES, USR_RES) \
+TEST_x_OP_x(uint64_t, check64, uint32_t, FUNC, SRC, RES, USR_RES)
+
+#define TEST_xp_OP_x(RESTYPE, CHECKFN, SRCTYPE, FUNC, SRC, \
+                     RES, PRED_RES, USR_RES) \
+    do { \
+        RESTYPE result; \
+        SRCTYPE src = SRC; \
+        uint8_t pred_result; \
+        uint32_t usr_result; \
+        result = FUNC(src, &pred_result, &usr_result); \
+        CHECKFN(result, RES); \
+        check(pred_result, PRED_RES); \
+        check(usr_result, USR_RES); \
+    } while (0)
+
+#define TEST_Rp_OP_R(FUNC, SRC, RES, PRED_RES, USR_RES) \
+TEST_xp_OP_x(uint32_t, check32, uint32_t, FUNC, SRC, RES, PRED_RES, USR_RES)
+
+#define TEST_x_OP_xx(RESTYPE, CHECKFN, SRC1TYPE, SRC2TYPE, \
+                     FUNC, SRC1, SRC2, RES, USR_RES) \
+    do { \
+        RESTYPE result; \
+        SRC1TYPE src1 = SRC1; \
+        SRC2TYPE src2 = SRC2; \
+        uint32_t usr_result; \
+        result = FUNC(src1, src2, &usr_result); \
+        CHECKFN(result, RES); \
+        check(usr_result, USR_RES); \
+    } while (0)
+
+#define TEST_P_OP_PP(FUNC, SRC1, SRC2, RES, USR_RES) \
+TEST_x_OP_xx(uint64_t, check64, uint64_t, uint64_t, \
+             FUNC, SRC1, SRC2, RES, USR_RES)
+
+#define TEST_R_OP_PP(FUNC, SRC1, SRC2, RES, USR_RES) \
+TEST_x_OP_xx(uint32_t, check32, uint64_t, uint64_t, \
+             FUNC, SRC1, SRC2, RES, USR_RES)
+
+#define TEST_P_OP_RR(FUNC, SRC1, SRC2, RES, USR_RES) \
+TEST_x_OP_xx(uint64_t, check64, uint32_t, uint32_t, \
+             FUNC, SRC1, SRC2, RES, USR_RES)
+
+#define TEST_R_OP_RR(FUNC, SRC1, SRC2, RES, USR_RES) \
+TEST_x_OP_xx(uint32_t, check32, uint32_t, uint32_t, \
+             FUNC, SRC1, SRC2, RES, USR_RES)
+
+#define TEST_R_OP_PR(FUNC, SRC1, SRC2, RES, USR_RES) \
+TEST_x_OP_xx(uint32_t, check32, uint64_t, uint32_t, \
+             FUNC, SRC1, SRC2, RES, USR_RES)
+
+#define TEST_P_OP_PR(FUNC, SRC1, SRC2, RES, USR_RES) \
+TEST_x_OP_xx(uint64_t, check64, uint64_t, uint32_t, \
+             FUNC, SRC1, SRC2, RES, USR_RES)
+
+#define TEST_xp_OP_xx(RESTYPE, CHECKFN, SRC1TYPE, SRC2TYPE, FUNC, SRC1, SRC2, \
+                      RES, PRED_RES, USR_RES) \
+    do { \
+        RESTYPE result; \
+        SRC1TYPE src1 = SRC1; \
+        SRC2TYPE src2 = SRC2; \
+        uint8_t pred_result; \
+        uint32_t usr_result; \
+        result = FUNC(src1, src2, &pred_result, &usr_result); \
+        CHECKFN(result, RES); \
+        check(pred_result, PRED_RES); \
+        check(usr_result, USR_RES); \
+    } while (0)
+
+#define TEST_Rp_OP_RR(FUNC, SRC1, SRC2, RES, PRED_RES, USR_RES) \
+TEST_xp_OP_xx(uint32_t, check32, uint32_t, uint32_t, FUNC, SRC1, SRC2, \
+              RES, PRED_RES, USR_RES)
+
+#define TEST_x_OP_xI(RESTYPE, CHECKFN, SRC1TYPE, \
+                     FUNC, SRC1, SRC2, RES, USR_RES) \
+    do { \
+        RESTYPE result; \
+        SRC1TYPE src1 = SRC1; \
+        uint32_t src2 = SRC2; \
+        uint32_t usr_result; \
+        result = FUNC(src1, src2, &usr_result); \
+        CHECKFN(result, RES); \
+        check(usr_result, USR_RES); \
+    } while (0)
+
+#define TEST_R_OP_RI(FUNC, SRC1, SRC2, RES, USR_RES) \
+TEST_x_OP_xI(uint32_t, check32, uint32_t, \
+             FUNC, SRC1, SRC2, RES, USR_RES)
+
+#define TEST_R_OP_PI(FUNC, SRC1, SRC2, RES, USR_RES) \
+TEST_x_OP_xI(uint32_t, check32, uint64_t, /* 32-bit result: use check32 */ \
+             FUNC, SRC1, SRC2, RES, USR_RES)
+
+#define TEST_Xx_OP_xx(RESTYPE, CHECKFN, SRC1TYPE, SRC2TYPE, \
+                      FUNC, RESIN, SRC1, SRC2, RES, USR_RES) \
+    do { \
+        RESTYPE result = RESIN; \
+        SRC1TYPE src1 = SRC1; \
+        SRC2TYPE src2 = SRC2; \
+        uint32_t usr_result; \
+        result = FUNC(result, src1, src2, &usr_result); \
+        CHECKFN(result, RES); \
+        check(usr_result, USR_RES); \
+    } while (0)
+
+#define TEST_XR_OP_RR(FUNC, RESIN, SRC1, SRC2, RES, USR_RES) \
+TEST_Xx_OP_xx(uint32_t, check32, uint32_t, uint32_t, \
+              FUNC, RESIN, SRC1, SRC2, RES, USR_RES)
+
+#define TEST_XP_OP_PP(FUNC, RESIN, SRC1, SRC2, RES, USR_RES) \
+TEST_Xx_OP_xx(uint64_t, check64, uint64_t, uint64_t, \
+              FUNC, RESIN, SRC1, SRC2, RES, USR_RES)
+
+#define TEST_XP_OP_RR(FUNC, RESIN, SRC1, SRC2, RES, USR_RES) \
+TEST_Xx_OP_xx(uint64_t, check64, uint32_t, uint32_t, \
+              FUNC, RESIN, SRC1, SRC2, RES, USR_RES)
+
+#define TEST_Xxp_OP_xx(RESTYPE, CHECKFN, SRC1TYPE, SRC2TYPE, \
+                       FUNC, RESIN, SRC1, SRC2, RES, PRED_RES, USR_RES) \
+    do { \
+        SRC1TYPE src1 = SRC1; \
+        SRC2TYPE src2 = SRC2; \
+        uint8_t pred_res; /* predicate produced by FUNC */ \
+        uint32_t usr_result; \
+        RESTYPE result = FUNC(RESIN, src1, src2, &pred_res, &usr_result); \
+        CHECKFN(result, RES); \
+        check(pred_res, PRED_RES); /* same check as TEST_xp_OP_xx */ \
+        check(usr_result, USR_RES); \
+    } while (0)
+
+#define TEST_XPp_OP_PP(FUNC, RESIN, SRC1, SRC2, RES, PRED_RES, USR_RES) \
+TEST_Xxp_OP_xx(uint64_t, check64, uint64_t, uint64_t, FUNC, RESIN, SRC1, SRC2, \
+               RES, PRED_RES, USR_RES)
+
+#define TEST_Xx_OP_xxp(RESTYPE, CHECKFN, SRC1TYPE, SRC2TYPE, \
+                      FUNC, RESIN, SRC1, SRC2, PRED, RES, USR_RES) \
+    do { \
+        RESTYPE result = RESIN; \
+        SRC1TYPE src1 = SRC1; \
+        SRC2TYPE src2 = SRC2; \
+        uint8_t pred = PRED; \
+        uint32_t usr_result; \
+        result = FUNC(result, src1, src2, pred, &usr_result); \
+        CHECKFN(result, RES); \
+        check(usr_result, USR_RES); \
+    } while (0)
+
+#define TEST_XR_OP_RRp(FUNC, RESIN, SRC1, SRC2, PRED, RES, USR_RES) \
+TEST_Xx_OP_xxp(uint32_t, check32, uint32_t, uint32_t, \
+              FUNC, RESIN, SRC1, SRC2, PRED, RES, USR_RES)
+
+int main()
+{
+    TEST_R_OP_R(satub,       0,         0,         USR_CLEAR);
+    TEST_R_OP_R(satub,       0xff,      0xff,      USR_CLEAR);
+    TEST_R_OP_R(satub,       0xfff,     0xff,      USR_OVF);
+    TEST_R_OP_R(satub,       -1,        0,         USR_OVF);
+
+    TEST_P_OP_PP(vaddubs,    0xfeLL,    0x01LL,    0xffLL,    USR_CLEAR);
+    TEST_P_OP_PP(vaddubs,    0xffLL,    0xffLL,    0xffLL,    USR_OVF);
+
+    TEST_P_OP_PP(vadduhs,    0xfffeLL,  0x1LL,     0xffffLL,  USR_CLEAR);
+    TEST_P_OP_PP(vadduhs,    0xffffLL,  0x1LL,     0xffffLL,  USR_OVF);
+
+    TEST_P_OP_PP(vsububs, 0x0807060504030201LL, 0x0101010101010101LL,
+                 0x0706050403020100LL, USR_CLEAR);
+    TEST_P_OP_PP(vsububs, 0x0807060504030201LL, 0x0202020202020202LL,
+                 0x0605040302010000LL, USR_OVF);
+
+    TEST_P_OP_PP(vsubuhs, 0x0004000300020001LL, 0x0001000100010001LL,
+                 0x0003000200010000LL, USR_CLEAR);
+    TEST_P_OP_PP(vsubuhs, 0x0004000300020001LL, 0x0002000200020002LL,
+                 0x0002000100000000LL, USR_OVF);
+
+    TEST_R_OP_PP(vaddhubs, 0x0004000300020001LL, 0x0001000100010001LL,
+                 0x05040302, USR_CLEAR);
+    TEST_R_OP_PP(vaddhubs, 0x7fff000300020001LL, 0x0002000200020002LL,
+                 0xff050403, USR_OVF);
+
+    TEST_R_OP_P(vsathub,         0x0001000300020001LL, 0x01030201, USR_CLEAR);
+    TEST_R_OP_P(vsathub,         0x010000700080ffffLL, 0xff708000, USR_OVF);
+
+    TEST_R_OP_P(vsatwuh,         0x0000ffff00000001LL, 0xffff0001, USR_CLEAR);
+    TEST_R_OP_P(vsatwuh,         0x800000000000ffffLL, 0x0000ffff, USR_OVF);
+
+    TEST_P_OP_P(vsatwuh_nopack,  0x0000ffff00000001LL, 0x0000ffff00000001LL,
+                USR_CLEAR);
+    TEST_P_OP_P(vsatwuh_nopack,  0x800000000000ffffLL, 0x000000000000ffffLL,
+                USR_OVF);
+
+    TEST_R_OP_R(svsathub,        0x00020001,           0x0201,     USR_CLEAR);
+    TEST_R_OP_R(svsathub,        0x0080ffff,           0x8000,     USR_OVF);
+
+    TEST_R_OP_PI(asrhub_sat,     0x004f003f002f001fLL, 3,    0x09070503,
+                 USR_CLEAR);
+    TEST_R_OP_PI(asrhub_sat,     0x004fffff8fff001fLL, 3,    0x09000003,
+                 USR_OVF);
+
+    TEST_R_OP_PI(asrhub_rnd_sat, 0x004f003f002f001fLL, 2,    0x0a080604,
+                 USR_CLEAR);
+    TEST_R_OP_PI(asrhub_rnd_sat, 0x004fffff8fff001fLL, 2,    0x0a000004,
+                 USR_OVF);
+
+    TEST_R_OP_RR(addsat,        1,              2,              3,
+                 USR_CLEAR);
+    TEST_R_OP_RR(addsat,        0x7fffffff,     0x00000010,     0x7fffffff,
+                 USR_OVF);
+    TEST_R_OP_RR(addsat,        0x80000000,     0x80000006,     0x80000000,
+                 USR_OVF);
+
+    TEST_P_OP_PP(addpsat, 1LL, 2LL, 3LL, USR_CLEAR);
+    /* overflow to max positive */
+    TEST_P_OP_PP(addpsat, 0x7ffffffffffffff0LL, 0x0000000000000010LL,
+                 0x7fffffffffffffffLL, USR_OVF);
+    /* overflow to min negative */
+    TEST_P_OP_PP(addpsat, 0x8000000000000003LL, 0x8000000000000006LL,
+                 0x8000000000000000LL, USR_OVF);
+
+    TEST_XR_OP_RR(mpy_acc_sat_hh_s0, 0x7fffffff, 0xffff0000, 0x11110000,
+                  0x7fffeeee, USR_CLEAR);
+    TEST_XR_OP_RR(mpy_acc_sat_hh_s0, 0x7fffffff, 0x7fff0000, 0x7fff0000,
+                  0x7fffffff, USR_OVF);
+
+    TEST_R_OP_RR(mpy_sat_hh_s1,        0xffff0000, 0x11110000, 0xffffddde,
+                 USR_CLEAR);
+    TEST_R_OP_RR(mpy_sat_hh_s1,        0x7fff0000, 0x7fff0000, 0x7ffe0002,
+                 USR_CLEAR);
+    TEST_R_OP_RR(mpy_sat_hh_s1,        0x80000000, 0x80000000, 0x7fffffff,
+                 USR_OVF);
+
+    TEST_R_OP_RR(mpy_sat_rnd_hh_s1,    0xffff0000, 0x11110000, 0x00005dde,
+                 USR_CLEAR);
+    TEST_R_OP_RR(mpy_sat_rnd_hh_s1,    0x7fff0000, 0x7fff0000, 0x7ffe8002,
+                 USR_CLEAR);
+    TEST_R_OP_RR(mpy_sat_rnd_hh_s1,    0x80000000, 0x80000000, 0x7fffffff,
+                 USR_OVF);
+
+    TEST_R_OP_RR(mpy_up_s1_sat,        0xffff0000, 0x11110000, 0xffffddde,
+                 USR_CLEAR);
+    TEST_R_OP_RR(mpy_up_s1_sat,        0x7fff0000, 0x7fff0000, 0x7ffe0002,
+                 USR_CLEAR);
+    TEST_R_OP_RR(mpy_up_s1_sat,        0x80000000, 0x80000000, 0x7fffffff,
+                 USR_OVF);
+
+    TEST_P_OP_RR(vmpy2s_s1,  0x7fff0000, 0x7fff0000, 0x7ffe000200000000LL,
+                 USR_CLEAR);
+    TEST_P_OP_RR(vmpy2s_s1,  0x80000000, 0x80000000, 0x7fffffff00000000LL,
+                 USR_OVF);
+
+    TEST_P_OP_RR(vmpy2su_s1, 0x7fff0000, 0x7fff0000, 0x7ffe000200000000LL,
+                 USR_CLEAR);
+    TEST_P_OP_RR(vmpy2su_s1, 0xffffbd97, 0xffffffff, 0xfffe000280000000LL,
+                 USR_OVF);
+
+    TEST_R_OP_RR(vmpy2s_s1pack,        0x7fff0000, 0x7fff0000, 0x7ffe0000,
+                 USR_CLEAR);
+    TEST_R_OP_RR(vmpy2s_s1pack,        0x80008000, 0x80008000, 0x7fff7fff,
+                 USR_OVF);
+
+    TEST_P_OP_PP(vmpy2es_s1, 0x7fff7fff7fff7fffLL, 0x1fff1fff1fff1fffLL,
+                 0x1ffec0021ffec002LL, USR_CLEAR);
+    TEST_P_OP_PP(vmpy2es_s1, 0x8000800080008000LL, 0x8000800080008000LL,
+                 0x7fffffff7fffffffLL, USR_OVF);
+
+    TEST_R_OP_PP(vdmpyrs_s1, 0x7fff7fff7fff7fffLL, 0x1fff1fff1fff1fffLL,
+                 0x3ffe3ffe, USR_CLEAR);
+    TEST_R_OP_PP(vdmpyrs_s1, 0x8000800080008000LL, 0x8000800080008000LL,
+                 0x7fff7fffLL, USR_OVF);
+
+    TEST_XP_OP_PP(vdmacs_s0, 0x0fffffffULL, 0x00ff00ff00ff00ffLL,
+                  0x00ff00ff00ff00ffLL, 0x0001fc021001fc01LL, USR_CLEAR);
+    TEST_XP_OP_PP(vdmacs_s0, 0x01111111ULL, 0x8000800080001000LL,
+                  0x8000800080008000LL, 0x7fffffff39111111LL, USR_OVF);
+
+    TEST_R_OP_RR(cmpyrs_s0,            0x7fff0000, 0x7fff0000, 0x0000c001,
+                 USR_CLEAR);
+    TEST_R_OP_RR(cmpyrs_s0,            0x80008000, 0x80008000, 0x7fff0000,
+                 USR_OVF);
+
+    TEST_XP_OP_RR(cmacs_s0, 0x0fffffff, 0x7fff0000, 0x7fff0000,
+                  0x00000000d000fffeLL, USR_CLEAR);
+    TEST_XP_OP_RR(cmacs_s0, 0x0fff1111, 0x80008000, 0x80008000,
+                  0x7fffffff0fff1111LL, USR_OVF);
+
+    TEST_XP_OP_RR(cnacs_s0, 0x000000108fffffffULL, 0x7fff0000, 0x7fff0000,
+                  0x00000010cfff0000ULL, USR_CLEAR);
+    TEST_XP_OP_RR(cnacs_s0, 0x000000108ff1111fULL, 0x00002001, 0x00007ffd,
+                  0x0000001080000000ULL, USR_OVF);
+
+    TEST_P_OP_PP(vrcmpys_s1_h, 0x00ff00ff00ff00ffLL, 0x00ff00ff00ff00ffLL,
+                 0x0003f8040003f804LL, USR_CLEAR);
+    TEST_P_OP_PP(vrcmpys_s1_h, 0x8000800080008000LL, 0x8000800080008000LL,
+                 0x7fffffff7fffffffLL, USR_OVF);
+
+    TEST_XP_OP_PP(mmacls_s0, 0x6fffffff, 0x00ff00ff00ff00ffLL,
+                  0x00ff00ff00ff00ffLL, 0x0000fe017000fe00LL, USR_CLEAR);
+    TEST_XP_OP_PP(mmacls_s0, 0x6f1111ff, 0x8000800080008000LL,
+                  0x1000100080008000LL, 0xf80008007fffffffLL, USR_OVF);
+
+    TEST_R_OP_RR(hmmpyl_rs1,           0x7fff0000, 0x7fff0001, 0x0000fffe,
+                 USR_CLEAR);
+    TEST_R_OP_RR(hmmpyl_rs1,           0x80000000, 0x80008000, 0x7fffffff,
+                 USR_OVF);
+
+    TEST_XP_OP_PP(mmaculs_s0, 0x000000007fffffffULL, 0xffff800080008000LL,
+                  0xffff800080008000LL, 0xffffc00040003fffLL, USR_CLEAR);
+    TEST_XP_OP_PP(mmaculs_s0, 0x000011107fffffffULL, 0x00ff00ff00ff00ffLL,
+                  0x00ff00ff001100ffLL, 0x00010f117fffffffLL, USR_OVF);
+
+    TEST_R_OP_PR(cmpyi_wh, 0x7fff000000000000LL, 0x7fff0001, 0x0000fffe,
+                 USR_CLEAR);
+    TEST_R_OP_PR(cmpyi_wh, 0x8000000000000000LL, 0x80008000, 0x7fffffff,
+                 USR_OVF);
+
+    TEST_P_OP_PP(vcmpy_s0_sat_i, 0x00ff00ff00ff00ffLL, 0x00ff00ff00ff00ffLL,
+                 0x0001fc020001fc02LL, USR_CLEAR);
+    TEST_P_OP_PP(vcmpy_s0_sat_i, 0x8000800080008000LL, 0x8000800080008000LL,
+                 0x7fffffff7fffffffLL, USR_OVF);
+
+    TEST_P_OP_PR(vcrotate, 0x8000000000000000LL, 0x00000002,
+                 0x8000000000000000LL, USR_CLEAR);
+    TEST_P_OP_PR(vcrotate, 0x7fff80007fff8000LL, 0x00000001,
+                 0x7fff80007fff7fffLL, USR_OVF);
+
+    TEST_P_OP_PR(vcnegh, 0x8000000000000000LL, 0x00000002,
+                 0x8000000000000000LL, USR_CLEAR);
+    TEST_P_OP_PR(vcnegh, 0x7fff80007fff8000LL, 0x00000001,
+                 0x7fff80007fff7fffLL, USR_OVF);
+
+#if CORE_HAS_AUDIO
+    TEST_R_OP_PP(wcmpyrw, 0x8765432101234567LL, 0x00000002ffffffffLL,
+                 0x00000001, USR_CLEAR);
+    TEST_R_OP_PP(wcmpyrw, 0x800000007fffffffLL, 0x000000ff7fffffffLL,
+                 0x7fffffff, USR_OVF);
+    TEST_R_OP_PP(wcmpyrw, 0x7fffffff80000000LL, 0x7fffffff000000ffLL,
+                 0x80000000, USR_OVF);
+#else
+    printf("Audio instructions skipped\n");
+#endif
+
+    TEST_R_OP_RR(addh_l16_sat_ll,      0x0000ffff, 0x00000002, 0x00000001,
+                 USR_CLEAR);
+    TEST_R_OP_RR(addh_l16_sat_ll,      0x00007fff, 0x00000005, 0x00007fff,
+                 USR_OVF);
+    TEST_R_OP_RR(addh_l16_sat_ll,      0x00008000, 0x00008000, 0xffff8000,
+                 USR_OVF);
+
+    TEST_P_OP_P(vconj, 0x0000ffff00000001LL, 0x0000ffff00000001LL, USR_CLEAR);
+    TEST_P_OP_P(vconj, 0x800000000000ffffLL, 0x7fff00000000ffffLL, USR_OVF);
+
+    TEST_P_OP_PP(vxaddsubw, 0x8765432101234567LL, 0x00000002ffffffffLL,
+                 0x8765432201234569LL, USR_CLEAR);
+    TEST_P_OP_PP(vxaddsubw, 0x7fffffff7fffffffLL, 0xffffffffffffffffLL,
+                 0x7fffffff7ffffffeLL, USR_OVF);
+    TEST_P_OP_PP(vxaddsubw, 0x800000000fffffffLL, 0x0000000a00000008LL,
+                 0x8000000010000009LL, USR_OVF);
+
+    TEST_P_OP_P(vabshsat, 0x0001000afffff800LL, 0x0001000a00010800LL,
+                USR_CLEAR);
+    TEST_P_OP_P(vabshsat, 0x8000000b000c000aLL, 0x7fff000b000c000aLL,
+             USR_OVF);
+
+    TEST_P_OP_PP(vnavgwr, 0x8765432101234567LL, 0x00000002ffffffffLL,
+                 0xc3b2a1900091a2b4LL, USR_CLEAR);
+    TEST_P_OP_PP(vnavgwr, 0x7fffffff8000000aLL, 0x80000000ffffffffLL,
+                 0x7fffffffc0000006LL, USR_OVF);
+
+    TEST_R_OP_RI(round_ri_sat,         0x0000ffff, 2, 0x00004000, USR_CLEAR);
+    TEST_R_OP_RI(round_ri_sat,         0x7fffffff, 2, 0x1fffffff, USR_OVF);
+
+    TEST_R_OP_RR(asr_r_r_sat,          0x0000ffff, 0x00000002, 0x00003fff,
+                 USR_CLEAR);
+    TEST_R_OP_RR(asr_r_r_sat,          0x00ffffff, 0xfffffff5, 0x7fffffff,
+                 USR_OVF);
+    TEST_R_OP_RR(asr_r_r_sat,          0x80000000, 0xfffffff5, 0x80000000,
+                 USR_OVF);
+
+    TEST_XPp_OP_PP(ACS, 0x0004000300020001ULL, 0x0001000200030004ULL,
+                   0x0000000000000000ULL, 0x0004000300030004ULL, 0xf0,
+                   USR_CLEAR);
+    TEST_XPp_OP_PP(ACS, 0x0004000300020001ULL, 0x0001000200030004ULL,
+                   0x000affff000d0000ULL, 0x000e0003000f0004ULL, 0xcc,
+                   USR_CLEAR);
+    TEST_XPp_OP_PP(ACS, 0x00047fff00020001ULL, 0x00017fff00030004ULL,
+                  0x000a0fff000d0000ULL, 0x000e7fff000f0004ULL, 0xfc,
+                  USR_OVF);
+    TEST_XPp_OP_PP(ACS, 0x00047fff00020001ULL, 0x00017fff001c0004ULL,
+                   0x000a0fff000d0000ULL, 0x000e7fff000f0004ULL, 0xf0,
+                   USR_OVF);
+
+    puts(err ? "FAIL" : "PASS");
+    return err;
+}
-- 
cgit v1.1


From 2479540fff4aa4519ff45e122be360492f970598 Mon Sep 17 00:00:00 2001
From: Taylor Simpson <tsimpson@quicinc.com>
Date: Wed, 9 Feb 2022 18:15:51 -0800
Subject: Hexagon (tests/tcg/hexagon) add floating point instructions to usr.c

Tests to confirm floating point instructions are properly
setting exception bits in USR

Signed-off-by: Taylor Simpson <tsimpson@quicinc.com>
Message-Id: <20220210021556.9217-8-tsimpson@quicinc.com>
Acked-by: Richard Henderson <richard.henderson@linaro.org>
---
 tests/tcg/hexagon/usr.c | 339 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 339 insertions(+)

(limited to 'tests')

diff --git a/tests/tcg/hexagon/usr.c b/tests/tcg/hexagon/usr.c
index e827272..11415f8 100644
--- a/tests/tcg/hexagon/usr.c
+++ b/tests/tcg/hexagon/usr.c
@@ -78,6 +78,34 @@ static void __check64(int line, uint64_t val, uint64_t expect)
 #define USR_FPUNFF           (1 << USR_FPUNFF_BIT)
 #define USR_FPINPF           (1 << USR_FPINPF_BIT)
 
+/* Some useful floating point values */
+const uint32_t SF_INF =              0x7f800000;
+const uint32_t SF_QNaN =             0x7fc00000;
+const uint32_t SF_SNaN =             0x7fb00000;
+const uint32_t SF_QNaN_neg =         0xffc00000;
+const uint32_t SF_SNaN_neg =         0xffb00000;
+const uint32_t SF_HEX_NaN =          0xffffffff;
+const uint32_t SF_zero =             0x00000000;
+const uint32_t SF_one =              0x3f800000;
+const uint32_t SF_one_recip =        0x3f7f0001;         /* 0.9960...  */
+const uint32_t SF_one_invsqrta =     0x3f7f0000;         /* 0.99609375 */
+const uint32_t SF_two =              0x40000000;
+const uint32_t SF_four =             0x40800000;
+const uint32_t SF_small_neg =        0xab98fba8;
+const uint32_t SF_large_pos =        0x5afa572e;
+
+const uint64_t DF_QNaN =             0x7ff8000000000000ULL;
+const uint64_t DF_SNaN =             0x7ff7000000000000ULL;
+const uint64_t DF_QNaN_neg =         0xfff8000000000000ULL;
+const uint64_t DF_SNaN_neg =         0xfff7000000000000ULL;
+const uint64_t DF_HEX_NaN =          0xffffffffffffffffULL;
+const uint64_t DF_zero =             0x0000000000000000ULL;
+const uint64_t DF_any =              0x3f80000000000000ULL;
+const uint64_t DF_one =              0x3ff0000000000000ULL;
+const uint64_t DF_one_hh =           0x3ff001ff80000000ULL;     /* 1.00048... */
+const uint64_t DF_small_neg =        0xbd731f7500000000ULL;
+const uint64_t DF_large_pos =        0x7f80000000000001ULL;
+
 /*
  * Templates for functions to execute an instruction
  *
@@ -309,6 +337,29 @@ static RESTYPE NAME(RESTYPE result, SRC1TYPE src1, SRC2TYPE src2, uint8_t pred,\
 #define FUNC_XR_OP_RRp(NAME, INSN) \
 FUNC_Xx_OP_xxp(uint32_t, uint32_t, uint32_t, NAME, INSN)
 
+/* Template for two-operand compares; INSN must write p1, which is returned */
+#define FUNC_CMP_xx(SRC1TYPE, SRC2TYPE, NAME, INSN) \
+static uint32_t NAME(SRC1TYPE src1, SRC2TYPE src2, uint32_t *usr_result) \
+{ \
+    uint32_t result; \
+    uint32_t usr; \
+    asm(CLEAR_USRBITS \
+        INSN "\n\t" \
+        "%0 = p1\n\t" /* copy the predicate into result */ \
+        "%1 = usr\n\t" /* read USR after INSN has executed */ \
+        : "=r"(result), "=r"(usr) \
+        : "r"(src1), "r"(src2) \
+        : "p1", "r2", "usr"); /* r2: presumably CLEAR_USRBITS scratch */ \
+    *usr_result = usr & 0x3f; /* report only the low six USR bits */ \
+    return result; \
+}
+
+#define FUNC_CMP_RR(NAME, INSN) \
+FUNC_CMP_xx(uint32_t, uint32_t, NAME, INSN)
+
+#define FUNC_CMP_PP(NAME, INSN) \
+FUNC_CMP_xx(uint64_t, uint64_t, NAME, INSN)
+
 /*
  * Function declarations using the templates
  */
@@ -379,6 +430,69 @@ FUNC_R_OP_RR(asr_r_r_sat,       "%0 = asr(%2, %3):sat")
 
 FUNC_XPp_OP_PP(ACS,             "%0, p2 = vacsh(%3, %4)")
 
+/* Floating point */
+FUNC_R_OP_RR(sfmin,             "%0 = sfmin(%2, %3)")
+FUNC_R_OP_RR(sfmax,             "%0 = sfmax(%2, %3)")
+FUNC_R_OP_RR(sfadd,             "%0 = sfadd(%2, %3)")
+FUNC_R_OP_RR(sfsub,             "%0 = sfsub(%2, %3)")
+FUNC_R_OP_RR(sfmpy,             "%0 = sfmpy(%2, %3)")
+FUNC_XR_OP_RR(sffma,            "%0 += sfmpy(%2, %3)")
+FUNC_XR_OP_RR(sffms,            "%0 -= sfmpy(%2, %3)")
+FUNC_CMP_RR(sfcmpuo,            "p1 = sfcmp.uo(%2, %3)")
+FUNC_CMP_RR(sfcmpeq,            "p1 = sfcmp.eq(%2, %3)")
+FUNC_CMP_RR(sfcmpgt,            "p1 = sfcmp.gt(%2, %3)")
+FUNC_CMP_RR(sfcmpge,            "p1 = sfcmp.ge(%2, %3)")
+
+FUNC_P_OP_PP(dfadd,             "%0 = dfadd(%2, %3)")
+FUNC_P_OP_PP(dfsub,             "%0 = dfsub(%2, %3)")
+
+#if CORE_IS_V67
+FUNC_P_OP_PP(dfmin,             "%0 = dfmin(%2, %3)")
+FUNC_P_OP_PP(dfmax,             "%0 = dfmax(%2, %3)")
+FUNC_XP_OP_PP(dfmpyhh,          "%0 += dfmpyhh(%2, %3)")
+#endif
+
+FUNC_CMP_PP(dfcmpuo,            "p1 = dfcmp.uo(%2, %3)")
+FUNC_CMP_PP(dfcmpeq,            "p1 = dfcmp.eq(%2, %3)")
+FUNC_CMP_PP(dfcmpgt,            "p1 = dfcmp.gt(%2, %3)")
+FUNC_CMP_PP(dfcmpge,            "p1 = dfcmp.ge(%2, %3)")
+
+/* Conversions from sf */
+FUNC_P_OP_R(conv_sf2df,         "%0 = convert_sf2df(%2)")
+FUNC_R_OP_R(conv_sf2uw,         "%0 = convert_sf2uw(%2)")
+FUNC_R_OP_R(conv_sf2w,          "%0 = convert_sf2w(%2)")
+FUNC_P_OP_R(conv_sf2ud,         "%0 = convert_sf2ud(%2)")
+FUNC_P_OP_R(conv_sf2d,          "%0 = convert_sf2d(%2)")
+FUNC_R_OP_R(conv_sf2uw_chop,    "%0 = convert_sf2uw(%2):chop")
+FUNC_R_OP_R(conv_sf2w_chop,     "%0 = convert_sf2w(%2):chop")
+FUNC_P_OP_R(conv_sf2ud_chop,    "%0 = convert_sf2ud(%2):chop")
+FUNC_P_OP_R(conv_sf2d_chop,     "%0 = convert_sf2d(%2):chop")
+
+/* Conversions from df */
+FUNC_R_OP_P(conv_df2sf,         "%0 = convert_df2sf(%2)")
+FUNC_R_OP_P(conv_df2uw,         "%0 = convert_df2uw(%2)")
+FUNC_R_OP_P(conv_df2w,          "%0 = convert_df2w(%2)")
+FUNC_P_OP_P(conv_df2ud,         "%0 = convert_df2ud(%2)")
+FUNC_P_OP_P(conv_df2d,          "%0 = convert_df2d(%2)")
+FUNC_R_OP_P(conv_df2uw_chop,    "%0 = convert_df2uw(%2):chop")
+FUNC_R_OP_P(conv_df2w_chop,     "%0 = convert_df2w(%2):chop")
+FUNC_P_OP_P(conv_df2ud_chop,    "%0 = convert_df2ud(%2):chop")
+FUNC_P_OP_P(conv_df2d_chop,     "%0 = convert_df2d(%2):chop")
+
+/* Integer to float conversions */
+FUNC_R_OP_R(conv_uw2sf,         "%0 = convert_uw2sf(%2)")
+FUNC_R_OP_R(conv_w2sf,          "%0 = convert_w2sf(%2)")
+FUNC_R_OP_P(conv_ud2sf,         "%0 = convert_ud2sf(%2)")
+FUNC_R_OP_P(conv_d2sf,          "%0 = convert_d2sf(%2)")
+
+/* Special purpose floating point instructions */
+FUNC_XR_OP_RRp(sffma_sc,        "%0 += sfmpy(%2, %3, p2):scale")
+FUNC_Rp_OP_RR(sfrecipa,         "%0, p2 = sfrecipa(%3, %4)")
+FUNC_R_OP_RR(sffixupn,          "%0 = sffixupn(%2, %3)")
+FUNC_R_OP_RR(sffixupd,          "%0 = sffixupd(%2, %3)")
+FUNC_R_OP_R(sffixupr,           "%0 = sffixupr(%2)")
+FUNC_Rp_OP_R(sfinvsqrta,        "%0, p2 = sfinvsqrta(%3)")
+
 /*
  * Templates for test cases
  *
@@ -554,6 +668,24 @@ TEST_Xxp_OP_xx(uint64_t, check64, uint64_t, uint64_t, FUNC, RESIN, SRC1, SRC2, \
 TEST_Xx_OP_xxp(uint32_t, check32, uint32_t, uint32_t, \
               FUNC, RESIN, SRC1, SRC2, PRED, RES, USR_RES)
 
+#define TEST_CMP_xx(SRC1TYPE, SRC2TYPE, \
+                    FUNC, SRC1, SRC2, RES, USR_RES) \
+    do { \
+        uint32_t result; \
+        SRC1TYPE src1 = SRC1; \
+        SRC2TYPE src2 = SRC2; \
+        uint32_t usr_result; \
+        result = FUNC(src1, src2, &usr_result); \
+        check(result, RES); \
+        check(usr_result, USR_RES); \
+    } while (0)
+
+#define TEST_CMP_RR(FUNC, SRC1, SRC2, RES, USR_RES) \
+TEST_CMP_xx(uint32_t, uint32_t, FUNC, SRC1, SRC2, RES, USR_RES)
+
+#define TEST_CMP_PP(FUNC, SRC1, SRC2, RES, USR_RES) \
+TEST_CMP_xx(uint64_t, uint64_t, FUNC, SRC1, SRC2, RES, USR_RES)
+
 int main()
 {
     TEST_R_OP_R(satub,       0,         0,         USR_CLEAR);
@@ -793,6 +925,213 @@ int main()
                    0x000a0fff000d0000ULL, 0x000e7fff000f0004ULL, 0xf0,
                    USR_OVF);
 
+    /* Floating point */
+    TEST_R_OP_RR(sfmin,  SF_one,      SF_small_neg,   SF_small_neg, USR_CLEAR);
+    TEST_R_OP_RR(sfmin,  SF_one,      SF_SNaN,        SF_one,       USR_FPINVF);
+    TEST_R_OP_RR(sfmin,  SF_SNaN,     SF_one,         SF_one,       USR_FPINVF);
+    TEST_R_OP_RR(sfmin,  SF_one,      SF_QNaN,        SF_one,       USR_CLEAR);
+    TEST_R_OP_RR(sfmin,  SF_QNaN,     SF_one,         SF_one,       USR_CLEAR);
+    TEST_R_OP_RR(sfmin,  SF_SNaN,     SF_QNaN,        SF_HEX_NaN,   USR_FPINVF);
+    TEST_R_OP_RR(sfmin,  SF_QNaN,     SF_SNaN,        SF_HEX_NaN,   USR_FPINVF);
+
+    TEST_R_OP_RR(sfmax,  SF_one,      SF_small_neg,   SF_one,       USR_CLEAR);
+    TEST_R_OP_RR(sfmax,  SF_one,      SF_SNaN,        SF_one,       USR_FPINVF);
+    TEST_R_OP_RR(sfmax,  SF_SNaN,     SF_one,         SF_one,       USR_FPINVF);
+    TEST_R_OP_RR(sfmax,  SF_one,      SF_QNaN,        SF_one,       USR_CLEAR);
+    TEST_R_OP_RR(sfmax,  SF_QNaN,     SF_one,         SF_one,       USR_CLEAR);
+    TEST_R_OP_RR(sfmax,  SF_SNaN,     SF_QNaN,        SF_HEX_NaN,   USR_FPINVF);
+    TEST_R_OP_RR(sfmax,  SF_QNaN,     SF_SNaN,        SF_HEX_NaN,   USR_FPINVF);
+
+    TEST_R_OP_RR(sfadd,  SF_one,      SF_QNaN,        SF_HEX_NaN,   USR_CLEAR);
+    TEST_R_OP_RR(sfadd,  SF_one,      SF_SNaN,        SF_HEX_NaN,   USR_FPINVF);
+    TEST_R_OP_RR(sfadd,  SF_QNaN,     SF_SNaN,        SF_HEX_NaN,   USR_FPINVF);
+    TEST_R_OP_RR(sfadd,  SF_SNaN,     SF_QNaN,        SF_HEX_NaN,   USR_FPINVF);
+
+    TEST_R_OP_RR(sfsub,  SF_one,      SF_QNaN,        SF_HEX_NaN,   USR_CLEAR);
+    TEST_R_OP_RR(sfsub,  SF_one,      SF_SNaN,        SF_HEX_NaN,   USR_FPINVF);
+    TEST_R_OP_RR(sfsub,  SF_QNaN,     SF_SNaN,        SF_HEX_NaN,   USR_FPINVF);
+    TEST_R_OP_RR(sfsub,  SF_SNaN,     SF_QNaN,        SF_HEX_NaN,   USR_FPINVF);
+
+    TEST_R_OP_RR(sfmpy,  SF_one,      SF_QNaN,        SF_HEX_NaN,   USR_CLEAR);
+    TEST_R_OP_RR(sfmpy,  SF_one,      SF_SNaN,        SF_HEX_NaN,   USR_FPINVF);
+    TEST_R_OP_RR(sfmpy,  SF_QNaN,     SF_SNaN,        SF_HEX_NaN,   USR_FPINVF);
+    TEST_R_OP_RR(sfmpy,  SF_SNaN,     SF_QNaN,        SF_HEX_NaN,   USR_FPINVF);
+
+    TEST_XR_OP_RR(sffma, SF_one,   SF_one,    SF_one,   SF_two,     USR_CLEAR);
+    TEST_XR_OP_RR(sffma, SF_zero,  SF_one,    SF_QNaN,  SF_HEX_NaN, USR_CLEAR);
+    TEST_XR_OP_RR(sffma, SF_zero,  SF_one,    SF_SNaN,  SF_HEX_NaN, USR_FPINVF);
+    TEST_XR_OP_RR(sffma, SF_zero,  SF_QNaN,   SF_SNaN,  SF_HEX_NaN, USR_FPINVF);
+    TEST_XR_OP_RR(sffma, SF_zero,  SF_SNaN,   SF_QNaN,  SF_HEX_NaN, USR_FPINVF);
+
+    TEST_XR_OP_RR(sffms, SF_one,   SF_one,    SF_one,   SF_zero,    USR_CLEAR);
+    TEST_XR_OP_RR(sffms, SF_zero,  SF_one,    SF_QNaN,  SF_HEX_NaN, USR_CLEAR);
+    TEST_XR_OP_RR(sffms, SF_zero,  SF_one,    SF_SNaN,  SF_HEX_NaN, USR_FPINVF);
+    TEST_XR_OP_RR(sffms, SF_zero,  SF_QNaN,   SF_SNaN,  SF_HEX_NaN, USR_FPINVF);
+    TEST_XR_OP_RR(sffms, SF_zero,  SF_SNaN,   SF_QNaN,  SF_HEX_NaN, USR_FPINVF);
+
+    TEST_CMP_RR(sfcmpuo, SF_one,      SF_large_pos,    0x00,    USR_CLEAR);
+    TEST_CMP_RR(sfcmpuo, SF_INF,      SF_large_pos,    0x00,    USR_CLEAR);
+    TEST_CMP_RR(sfcmpuo, SF_QNaN,     SF_large_pos,    0xff,    USR_CLEAR);
+    TEST_CMP_RR(sfcmpuo, SF_QNaN_neg, SF_large_pos,    0xff,    USR_CLEAR);
+    TEST_CMP_RR(sfcmpuo, SF_SNaN,     SF_large_pos,    0xff,    USR_FPINVF);
+    TEST_CMP_RR(sfcmpuo, SF_SNaN_neg, SF_large_pos,    0xff,    USR_FPINVF);
+    TEST_CMP_RR(sfcmpuo, SF_QNaN,     SF_QNaN,         0xff,    USR_CLEAR);
+    TEST_CMP_RR(sfcmpuo, SF_QNaN,     SF_SNaN,         0xff,    USR_FPINVF);
+
+    TEST_CMP_RR(sfcmpeq, SF_one,      SF_QNaN,         0x00,    USR_CLEAR);
+    TEST_CMP_RR(sfcmpeq, SF_one,      SF_SNaN,         0x00,    USR_FPINVF);
+    TEST_CMP_RR(sfcmpgt, SF_one,      SF_QNaN,         0x00,    USR_CLEAR);
+    TEST_CMP_RR(sfcmpgt, SF_one,      SF_SNaN,         0x00,    USR_FPINVF);
+    TEST_CMP_RR(sfcmpge, SF_one,      SF_QNaN,         0x00,    USR_CLEAR);
+    TEST_CMP_RR(sfcmpge, SF_one,      SF_SNaN,         0x00,    USR_FPINVF);
+
+    TEST_P_OP_PP(dfadd,  DF_any,    DF_QNaN,         DF_HEX_NaN,    USR_CLEAR);
+    TEST_P_OP_PP(dfadd,  DF_any,    DF_SNaN,         DF_HEX_NaN,    USR_FPINVF);
+    TEST_P_OP_PP(dfadd,  DF_QNaN,   DF_SNaN,         DF_HEX_NaN,    USR_FPINVF);
+    TEST_P_OP_PP(dfadd,  DF_SNaN,   DF_QNaN,         DF_HEX_NaN,    USR_FPINVF);
+
+    TEST_P_OP_PP(dfsub,  DF_any,    DF_QNaN,         DF_HEX_NaN,    USR_CLEAR);
+    TEST_P_OP_PP(dfsub,  DF_any,    DF_SNaN,         DF_HEX_NaN,    USR_FPINVF);
+    TEST_P_OP_PP(dfsub,  DF_QNaN,   DF_SNaN,         DF_HEX_NaN,    USR_FPINVF);
+    TEST_P_OP_PP(dfsub,  DF_SNaN,   DF_QNaN,         DF_HEX_NaN,    USR_FPINVF);
+
+#if CORE_IS_V67
+    TEST_P_OP_PP(dfmin,  DF_any,    DF_small_neg,    DF_small_neg,  USR_CLEAR);
+    TEST_P_OP_PP(dfmin,  DF_any,    DF_SNaN,         DF_any,        USR_FPINVF);
+    TEST_P_OP_PP(dfmin,  DF_SNaN,   DF_any,          DF_any,        USR_FPINVF);
+    TEST_P_OP_PP(dfmin,  DF_any,    DF_QNaN,         DF_any,        USR_CLEAR);
+    TEST_P_OP_PP(dfmin,  DF_QNaN,   DF_any,          DF_any,        USR_CLEAR);
+    TEST_P_OP_PP(dfmin,  DF_SNaN,   DF_QNaN,         DF_HEX_NaN,    USR_FPINVF);
+    TEST_P_OP_PP(dfmin,  DF_QNaN,   DF_SNaN,         DF_HEX_NaN,    USR_FPINVF);
+
+    TEST_P_OP_PP(dfmax,  DF_any,    DF_small_neg,    DF_any,        USR_CLEAR);
+    TEST_P_OP_PP(dfmax,  DF_any,    DF_SNaN,         DF_any,        USR_FPINVF);
+    TEST_P_OP_PP(dfmax,  DF_SNaN,   DF_any,          DF_any,        USR_FPINVF);
+    TEST_P_OP_PP(dfmax,  DF_any,    DF_QNaN,         DF_any,        USR_CLEAR);
+    TEST_P_OP_PP(dfmax,  DF_QNaN,   DF_any,          DF_any,        USR_CLEAR);
+    TEST_P_OP_PP(dfmax,  DF_SNaN,   DF_QNaN,         DF_HEX_NaN,    USR_FPINVF);
+    TEST_P_OP_PP(dfmax,  DF_QNaN,   DF_SNaN,         DF_HEX_NaN,    USR_FPINVF);
+
+    TEST_XP_OP_PP(dfmpyhh, DF_one,   DF_one,  DF_one,   DF_one_hh,  USR_CLEAR);
+    TEST_XP_OP_PP(dfmpyhh, DF_zero,  DF_any,  DF_QNaN,  DF_HEX_NaN, USR_CLEAR);
+    TEST_XP_OP_PP(dfmpyhh, DF_zero,  DF_any,  DF_SNaN,  DF_HEX_NaN, USR_FPINVF);
+    TEST_XP_OP_PP(dfmpyhh, DF_zero,  DF_QNaN, DF_SNaN,  DF_HEX_NaN, USR_FPINVF);
+    TEST_XP_OP_PP(dfmpyhh, DF_zero,  DF_SNaN, DF_QNaN,  DF_HEX_NaN, USR_FPINVF);
+#else
+    printf("v67 instructions skipped\n");
+#endif
+
+    TEST_CMP_PP(dfcmpuo, DF_small_neg, DF_any,          0x00,    USR_CLEAR);
+    TEST_CMP_PP(dfcmpuo, DF_large_pos, DF_any,          0x00,    USR_CLEAR);
+    TEST_CMP_PP(dfcmpuo, DF_QNaN,      DF_any,          0xff,    USR_CLEAR);
+    TEST_CMP_PP(dfcmpuo, DF_QNaN_neg,  DF_any,          0xff,    USR_CLEAR);
+    TEST_CMP_PP(dfcmpuo, DF_SNaN,      DF_any,          0xff,    USR_FPINVF);
+    TEST_CMP_PP(dfcmpuo, DF_SNaN_neg,  DF_any,          0xff,    USR_FPINVF);
+    TEST_CMP_PP(dfcmpuo, DF_QNaN,      DF_QNaN,         0xff,    USR_CLEAR);
+    TEST_CMP_PP(dfcmpuo, DF_QNaN,      DF_SNaN,         0xff,    USR_FPINVF);
+
+    TEST_CMP_PP(dfcmpeq, DF_any,       DF_QNaN,         0x00,    USR_CLEAR);
+    TEST_CMP_PP(dfcmpeq, DF_any,       DF_SNaN,         0x00,    USR_FPINVF);
+    TEST_CMP_PP(dfcmpgt, DF_any,       DF_QNaN,         0x00,    USR_CLEAR);
+    TEST_CMP_PP(dfcmpgt, DF_any,       DF_SNaN,         0x00,    USR_FPINVF);
+    TEST_CMP_PP(dfcmpge, DF_any,       DF_QNaN,         0x00,    USR_CLEAR);
+    TEST_CMP_PP(dfcmpge, DF_any,       DF_SNaN,         0x00,    USR_FPINVF);
+
+    TEST_P_OP_R(conv_sf2df,       SF_QNaN,  DF_HEX_NaN,             USR_CLEAR);
+    TEST_P_OP_R(conv_sf2df,       SF_SNaN,  DF_HEX_NaN,             USR_FPINVF);
+    TEST_R_OP_R(conv_sf2uw,       SF_QNaN,  0xffffffff,             USR_FPINVF);
+    TEST_R_OP_R(conv_sf2uw,       SF_SNaN,  0xffffffff,             USR_FPINVF);
+    TEST_R_OP_R(conv_sf2w,        SF_QNaN,  0xffffffff,             USR_FPINVF);
+    TEST_R_OP_R(conv_sf2w,        SF_SNaN,  0xffffffff,             USR_FPINVF);
+    TEST_P_OP_R(conv_sf2ud,       SF_QNaN,  0xffffffffffffffffULL,  USR_FPINVF);
+    TEST_P_OP_R(conv_sf2ud,       SF_SNaN,  0xffffffffffffffffULL,  USR_FPINVF);
+    TEST_P_OP_R(conv_sf2d,        SF_QNaN,  0xffffffffffffffffULL,  USR_FPINVF);
+    TEST_P_OP_R(conv_sf2d,        SF_SNaN,  0xffffffffffffffffULL,  USR_FPINVF);
+    TEST_R_OP_R(conv_sf2uw_chop,  SF_QNaN,  0xffffffff,             USR_FPINVF);
+    TEST_R_OP_R(conv_sf2uw_chop,  SF_SNaN,  0xffffffff,             USR_FPINVF);
+    TEST_R_OP_R(conv_sf2w_chop,   SF_QNaN,  0xffffffff,             USR_FPINVF);
+    TEST_R_OP_R(conv_sf2w_chop,   SF_SNaN,  0xffffffff,             USR_FPINVF);
+    TEST_P_OP_R(conv_sf2ud_chop,  SF_QNaN,  0xffffffffffffffffULL,  USR_FPINVF);
+    TEST_P_OP_R(conv_sf2ud_chop,  SF_SNaN,  0xffffffffffffffffULL,  USR_FPINVF);
+    TEST_P_OP_R(conv_sf2d_chop,   SF_QNaN,  0xffffffffffffffffULL,  USR_FPINVF);
+    TEST_P_OP_R(conv_sf2d_chop,   SF_SNaN,  0xffffffffffffffffULL,  USR_FPINVF);
+
+    TEST_R_OP_P(conv_df2sf,       DF_QNaN,  SF_HEX_NaN,             USR_CLEAR);
+    TEST_R_OP_P(conv_df2sf,       DF_SNaN,  SF_HEX_NaN,             USR_FPINVF);
+    TEST_R_OP_P(conv_df2uw,       DF_QNaN,  0xffffffff,             USR_FPINVF);
+    TEST_R_OP_P(conv_df2uw,       DF_SNaN,  0xffffffff,             USR_FPINVF);
+    TEST_R_OP_P(conv_df2w,        DF_QNaN,  0xffffffff,             USR_FPINVF);
+    TEST_R_OP_P(conv_df2w,        DF_SNaN,  0xffffffff,             USR_FPINVF);
+    TEST_P_OP_P(conv_df2ud,       DF_QNaN,  0xffffffffffffffffULL,  USR_FPINVF);
+    TEST_P_OP_P(conv_df2ud,       DF_SNaN,  0xffffffffffffffffULL,  USR_FPINVF);
+    TEST_P_OP_P(conv_df2d,        DF_QNaN,  0xffffffffffffffffULL,  USR_FPINVF);
+    TEST_P_OP_P(conv_df2d,        DF_SNaN,  0xffffffffffffffffULL,  USR_FPINVF);
+    TEST_R_OP_P(conv_df2uw_chop,  DF_QNaN,  0xffffffff,             USR_FPINVF);
+    TEST_R_OP_P(conv_df2uw_chop,  DF_SNaN,  0xffffffff,             USR_FPINVF);
+    TEST_R_OP_P(conv_df2w_chop,   DF_QNaN,  0xffffffff,             USR_FPINVF);
+    TEST_R_OP_P(conv_df2w_chop,   DF_SNaN,  0xffffffff,             USR_FPINVF);
+    TEST_P_OP_P(conv_df2ud_chop,  DF_QNaN,  0xffffffffffffffffULL,  USR_FPINVF);
+    TEST_P_OP_P(conv_df2ud_chop,  DF_SNaN,  0xffffffffffffffffULL,  USR_FPINVF);
+    TEST_P_OP_P(conv_df2d_chop,   DF_QNaN,  0xffffffffffffffffULL,  USR_FPINVF);
+    TEST_P_OP_P(conv_df2d_chop,   DF_SNaN,  0xffffffffffffffffULL,  USR_FPINVF);
+
+    TEST_R_OP_R(conv_uw2sf,    0x00000001,             SF_one,      USR_CLEAR);
+    TEST_R_OP_R(conv_uw2sf,    0x010020a5,             0x4b801052,  USR_FPINPF);
+    TEST_R_OP_R(conv_w2sf,     0x00000001,             SF_one,      USR_CLEAR);
+    TEST_R_OP_R(conv_w2sf,     0x010020a5,             0x4b801052,  USR_FPINPF);
+    TEST_R_OP_P(conv_ud2sf,    0x0000000000000001ULL,  SF_one,      USR_CLEAR);
+    TEST_R_OP_P(conv_ud2sf,    0x00000000010020a5ULL,  0x4b801052,  USR_FPINPF);
+    TEST_R_OP_P(conv_d2sf,     0x0000000000000001ULL,  SF_one,      USR_CLEAR);
+    TEST_R_OP_P(conv_d2sf,     0x00000000010020a5ULL,  0x4b801052,  USR_FPINPF);
+
+    TEST_XR_OP_RRp(sffma_sc, SF_one,   SF_one,    SF_one,   1, SF_four,
+                   USR_CLEAR);
+    TEST_XR_OP_RRp(sffma_sc, SF_QNaN,  SF_one,    SF_one,   1, SF_HEX_NaN,
+                   USR_CLEAR);
+    TEST_XR_OP_RRp(sffma_sc, SF_one,   SF_QNaN,   SF_one,   1, SF_HEX_NaN,
+                   USR_CLEAR);
+    TEST_XR_OP_RRp(sffma_sc, SF_one,   SF_one,    SF_QNaN,  1, SF_HEX_NaN,
+                   USR_CLEAR);
+    TEST_XR_OP_RRp(sffma_sc, SF_SNaN,  SF_one,    SF_one,   1, SF_HEX_NaN,
+                   USR_FPINVF);
+    TEST_XR_OP_RRp(sffma_sc, SF_one,   SF_SNaN,   SF_one,   1, SF_HEX_NaN,
+                   USR_FPINVF);
+    TEST_XR_OP_RRp(sffma_sc, SF_one,   SF_one,    SF_SNaN,  1, SF_HEX_NaN,
+                   USR_FPINVF);
+
+    TEST_Rp_OP_RR(sfrecipa, SF_one,    SF_one,    SF_one_recip,   0x00,
+                  USR_CLEAR);
+    TEST_Rp_OP_RR(sfrecipa, SF_QNaN,   SF_one,    SF_HEX_NaN,     0x00,
+                  USR_CLEAR);
+    TEST_Rp_OP_RR(sfrecipa, SF_one,    SF_QNaN,   SF_HEX_NaN,     0x00,
+                  USR_CLEAR);
+    TEST_Rp_OP_RR(sfrecipa, SF_one,    SF_SNaN,   SF_HEX_NaN,     0x00,
+                  USR_FPINVF);
+    TEST_Rp_OP_RR(sfrecipa, SF_SNaN,   SF_one,    SF_HEX_NaN,     0x00,
+                  USR_FPINVF);
+
+    TEST_R_OP_RR(sffixupn, SF_one,     SF_one,    SF_one,       USR_CLEAR);
+    TEST_R_OP_RR(sffixupn, SF_QNaN,    SF_one,    SF_HEX_NaN,   USR_CLEAR);
+    TEST_R_OP_RR(sffixupn, SF_one,     SF_QNaN,   SF_HEX_NaN,   USR_CLEAR);
+    TEST_R_OP_RR(sffixupn, SF_SNaN,    SF_one,    SF_HEX_NaN,   USR_FPINVF);
+    TEST_R_OP_RR(sffixupn, SF_one,     SF_SNaN,   SF_HEX_NaN,   USR_FPINVF);
+
+    TEST_R_OP_RR(sffixupd, SF_one,     SF_one,    SF_one,       USR_CLEAR);
+    TEST_R_OP_RR(sffixupd, SF_QNaN,    SF_one,    SF_HEX_NaN,   USR_CLEAR);
+    TEST_R_OP_RR(sffixupd, SF_one,     SF_QNaN,   SF_HEX_NaN,   USR_CLEAR);
+    TEST_R_OP_RR(sffixupd, SF_SNaN,    SF_one,    SF_HEX_NaN,   USR_FPINVF);
+    TEST_R_OP_RR(sffixupd, SF_one,     SF_SNaN,   SF_HEX_NaN,   USR_FPINVF);
+
+    TEST_R_OP_R(sffixupr, SF_one,             SF_one,           USR_CLEAR);
+    TEST_R_OP_R(sffixupr, SF_QNaN,            SF_HEX_NaN,       USR_CLEAR);
+    TEST_R_OP_R(sffixupr, SF_SNaN,            SF_HEX_NaN,       USR_FPINVF);
+
+    TEST_Rp_OP_R(sfinvsqrta, SF_one,        SF_one_invsqrta,  0x00, USR_CLEAR);
+    TEST_Rp_OP_R(sfinvsqrta, SF_zero,       SF_one,           0x00, USR_CLEAR);
+    TEST_Rp_OP_R(sfinvsqrta, SF_QNaN,       SF_HEX_NaN,       0x00, USR_CLEAR);
+    TEST_Rp_OP_R(sfinvsqrta, SF_small_neg,  SF_HEX_NaN,       0x00, USR_FPINVF);
+    TEST_Rp_OP_R(sfinvsqrta, SF_SNaN,       SF_HEX_NaN,       0x00, USR_FPINVF);
+
     puts(err ? "FAIL" : "PASS");
     return err;
 }
-- 
cgit v1.1


From 8576e7ecae056845de6e0bafc547501f2bc6461c Mon Sep 17 00:00:00 2001
From: Taylor Simpson <tsimpson@quicinc.com>
Date: Wed, 9 Feb 2022 18:15:52 -0800
Subject: Hexagon (tests/tcg/hexagon) update overflow test

Add a test that sets USR multiple times in a packet

Signed-off-by: Taylor Simpson <tsimpson@quicinc.com>
Message-Id: <20220210021556.9217-9-tsimpson@quicinc.com>
Acked-by: Richard Henderson <richard.henderson@linaro.org>
---
 tests/tcg/hexagon/overflow.c | 61 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 60 insertions(+), 1 deletion(-)

(limited to 'tests')

diff --git a/tests/tcg/hexagon/overflow.c b/tests/tcg/hexagon/overflow.c
index 196fcf7..9408785 100644
--- a/tests/tcg/hexagon/overflow.c
+++ b/tests/tcg/hexagon/overflow.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright(c) 2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *  Copyright(c) 2021-2022 Qualcomm Innovation Center, Inc. All Rights Reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -72,6 +72,20 @@ int read_usr_overflow(void)
     return result & 1;
 }
 
+int get_usr_overflow(int usr)
+{
+    return (usr & 0x1) != 0;            /* overflow flag: USR bit 0 */
+}
+
+int get_usr_fp_invalid(int usr)
+{
+    return (usr & 0x2) != 0;            /* FP invalid flag: USR bit 1 */
+}
+
+int get_usr_lpcfg(int usr)
+{
+    return (usr & 0x300) >> 8;          /* two-bit field: USR bits 9:8 */
+}
 
 jmp_buf jmp_env;
 int usr_overflow;
@@ -82,6 +96,49 @@ static void sig_segv(int sig, siginfo_t *info, void *puc)
     longjmp(jmp_env, 1);
 }
 
+/* Check that every USR update made inside a single packet is kept */
+static void test_packet(void)
+{
+    int convres, satres, usr;
+
+    /* The conversion and satb share a packet; expect FP invalid + overflow */
+    asm("r2 = usr\n\t"
+        "r2 = clrbit(r2, #0)\n\t"        /* clear overflow bit */
+        "r2 = clrbit(r2, #1)\n\t"        /* clear FP invalid bit */
+        "usr = r2\n\t"
+        "{\n\t"
+        "    %0 = convert_sf2uw(%3):chop\n\t"
+        "    %1 = satb(%4)\n\t"
+        "}\n\t"
+        "%2 = usr\n\t"
+        : "=r"(convres), "=r"(satres), "=r"(usr)
+        : "r"(0x6a051b86), "r"(0x0410eec0)
+        : "r2", "usr");
+
+    check(convres, 0xffffffff);
+    check(satres, 0x7f);
+    check(get_usr_overflow(usr), 1);
+    check(get_usr_fp_invalid(usr), 1);
+
+    /* satb sits in the :endloop0 packet; expect overflow and LPCFG in USR */
+    asm("r2 = usr\n\t"
+        "r2 = clrbit(r2, #0)\n\t"        /* clear overflow bit */
+        "usr = r2\n\t"
+        "p3 = sp3loop0(1f, #1)\n\t"
+        "1:\n\t"
+        "{\n\t"
+        "    %0 = satb(%2)\n\t"
+        "}:endloop0\n\t"
+        "%1 = usr\n\t"
+        : "=r"(satres), "=r"(usr)
+        : "r"(0x0410eec0)                /* input only, never written */
+        : "r2", "usr", "p3", "sa0", "lc0");
+
+    check(satres, 0x7f);
+    check(get_usr_overflow(usr), 1);
+    check(get_usr_lpcfg(usr), 2);
+}
+
 int main()
 {
     struct sigaction act;
@@ -102,6 +159,8 @@ int main()
 
     check(usr_overflow, 0);
 
+    test_packet();
+
     puts(err ? "FAIL" : "PASS");
     return err ? EXIT_FAILURE : EXIT_SUCCESS;
 }
-- 
cgit v1.1


From 8af2d9978ad2c52377fe69466a556fffeedcd057 Mon Sep 17 00:00:00 2001
From: Taylor Simpson <tsimpson@quicinc.com>
Date: Wed, 9 Feb 2022 18:15:53 -0800
Subject: Hexagon (tests/tcg/hexagon) fix inline asm in preg_alias.c

Replace consecutive inline asm blocks with a single one with proper
outputs/inputs/clobbers rather than making assumptions about register
values being carried between separate blocks.

Signed-off-by: Taylor Simpson <tsimpson@quicinc.com>
Message-Id: <20220210021556.9217-10-tsimpson@quicinc.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
---
 tests/tcg/hexagon/preg_alias.c | 46 ++++++++++++++++++++----------------------
 1 file changed, 22 insertions(+), 24 deletions(-)

(limited to 'tests')

diff --git a/tests/tcg/hexagon/preg_alias.c b/tests/tcg/hexagon/preg_alias.c
index 0cac469..79febec 100644
--- a/tests/tcg/hexagon/preg_alias.c
+++ b/tests/tcg/hexagon/preg_alias.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright(c) 2019-2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *  Copyright(c) 2019-2022 Qualcomm Innovation Center, Inc. All Rights Reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -57,17 +57,15 @@ typedef union {
 
 static inline void creg_alias(int cval, PRegs *pregs)
 {
-  unsigned char val;
-  asm volatile("c4 = %0" : : "r"(cval));
-
-  asm volatile("%0 = p0" : "=r"(val));
-  pregs->pregs.p0 = val;
-  asm volatile("%0 = p1" : "=r"(val));
-  pregs->pregs.p1 = val;
-  asm volatile("%0 = p2" : "=r"(val));
-  pregs->pregs.p2 = val;
-  asm volatile("%0 = p3" : "=r"(val));
-  pregs->pregs.p3 = val;
+  asm("c4 = %4\n\t"                /* c4 aliases P3:0 */
+      "%0 = p0\n\t"                /* read each predicate back */
+      "%1 = p1\n\t"
+      "%2 = p2\n\t"
+      "%3 = p3\n\t"
+      : "=r"(pregs->pregs.p0), "=r"(pregs->pregs.p1),
+        "=r"(pregs->pregs.p2), "=r"(pregs->pregs.p3)
+      : "r"(cval)
+      : "p0", "p1", "p2", "p3");   /* the c4 write changes all four */
 }
 
 int err;
@@ -83,19 +81,19 @@ static void check(int val, int expect)
 static inline void creg_alias_pair(unsigned int cval, PRegs *pregs)
 {
   unsigned long long cval_pair = (0xdeadbeefULL << 32) | cval;
-  unsigned char val;
   int c5;
-  asm volatile("c5:4 = %0" : : "r"(cval_pair));
-
-  asm volatile("%0 = p0" : "=r"(val));
-  pregs->pregs.p0 = val;
-  asm volatile("%0 = p1" : "=r"(val));
-  pregs->pregs.p1 = val;
-  asm volatile("%0 = p2" : "=r"(val));
-  pregs->pregs.p2 = val;
-  asm volatile("%0 = p3" : "=r"(val));
-  pregs->pregs.p3 = val;
-  asm volatile("%0 = c5" : "=r"(c5));
+
+  asm ("c5:4 = %5\n\t"             /* one write to the c5:c4 pair */
+       "%0 = p0\n\t"               /* read each predicate back */
+       "%1 = p1\n\t"
+       "%2 = p2\n\t"
+       "%3 = p3\n\t"
+       "%4 = c5\n\t"               /* upper word, checked below */
+       : "=r"(pregs->pregs.p0), "=r"(pregs->pregs.p1),
+         "=r"(pregs->pregs.p2), "=r"(pregs->pregs.p3), "=r"(c5)
+       : "r"(cval_pair)
+       : "p0", "p1", "p2", "p3");  /* c4 aliases P3:0 */
+
   check(c5, 0xdeadbeef);
 }
 
-- 
cgit v1.1


From 3977ba3078503fca0f182aa8a39fad2388f43cb1 Mon Sep 17 00:00:00 2001
From: Taylor Simpson <tsimpson@quicinc.com>
Date: Wed, 9 Feb 2022 18:15:54 -0800
Subject: Hexagon (target/hexagon) fix bug in conv_df2uw_chop

Fix typo that checked for 32 bit nan instead of 64 bit

Test case added in tests/tcg/hexagon/usr.c

Signed-off-by: Taylor Simpson <tsimpson@quicinc.com>
Message-Id: <20220210021556.9217-11-tsimpson@quicinc.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
---
 tests/tcg/hexagon/usr.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'tests')

diff --git a/tests/tcg/hexagon/usr.c b/tests/tcg/hexagon/usr.c
index 11415f8..a531511 100644
--- a/tests/tcg/hexagon/usr.c
+++ b/tests/tcg/hexagon/usr.c
@@ -1068,6 +1068,10 @@ int main()
     TEST_P_OP_P(conv_df2d,        DF_SNaN,  0xffffffffffffffffULL,  USR_FPINVF);
     TEST_R_OP_P(conv_df2uw_chop,  DF_QNaN,  0xffffffff,             USR_FPINVF);
     TEST_R_OP_P(conv_df2uw_chop,  DF_SNaN,  0xffffffff,             USR_FPINVF);
+
+    /* Test for typo in HELPER(conv_df2uw_chop) */
+    TEST_R_OP_P(conv_df2uw_chop, 0xffffff7f00000001ULL, 0xffffffff, USR_FPINVF);
+
     TEST_R_OP_P(conv_df2w_chop,   DF_QNaN,  0xffffffff,             USR_FPINVF);
     TEST_R_OP_P(conv_df2w_chop,   DF_SNaN,  0xffffffff,             USR_FPINVF);
     TEST_P_OP_P(conv_df2ud_chop,  DF_QNaN,  0xffffffffffffffffULL,  USR_FPINVF);
-- 
cgit v1.1


From c0d86060f033fc8d591b0163e380ff6cd04f213a Mon Sep 17 00:00:00 2001
From: Taylor Simpson <tsimpson@quicinc.com>
Date: Wed, 9 Feb 2022 18:15:56 -0800
Subject: Hexagon (target/hexagon) assignment to c4 should wait until packet
 commit

On Hexagon, c4 is an alias for predicate registers P3:0.  If we assign to
c4 inside a packet with reads from predicate registers, the predicate
reads should get the old values.

Test case added to tests/tcg/hexagon/preg_alias.c

Co-authored-by: Michael Lambert <mlambert@quicinc.com>
Signed-off-by: Taylor Simpson <tsimpson@quicinc.com>
Message-Id: <20220210021556.9217-13-tsimpson@quicinc.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
---
 tests/tcg/hexagon/preg_alias.c | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

(limited to 'tests')

diff --git a/tests/tcg/hexagon/preg_alias.c b/tests/tcg/hexagon/preg_alias.c
index 79febec..b44a811 100644
--- a/tests/tcg/hexagon/preg_alias.c
+++ b/tests/tcg/hexagon/preg_alias.c
@@ -97,6 +97,42 @@ static inline void creg_alias_pair(unsigned int cval, PRegs *pregs)
   check(c5, 0xdeadbeef);
 }
 
+static void test_packet(void)
+{
+    /*
+     * Test that setting c4 inside a packet doesn't impact the predicates
+     * that are read during the packet.
+     */
+
+    int result;
+    int old_val = 0x0000001c;
+
+    /* Test a predicated register transfer */
+    result = old_val;
+    asm (
+         "c4 = %1\n\t"                   /* p2 is set before the packet */
+         "{\n\t"
+         "    c4 = %2\n\t"
+         "    if (!p2) %0 = %3\n\t"      /* must still see the old p2 */
+         "}\n\t"
+         : "+r"(result)
+         : "r"(0xffffffff), "r"(0xff00ffff), "r"(0x837ed653)
+         : "p0", "p1", "p2", "p3");
+    check(result, old_val);              /* so the transfer is skipped */
+
+    /* Test a predicated store */
+    result = 0xffffffff;
+    asm ("c4 = %0\n\t"                   /* p2 is clear before the packet */
+         "{\n\t"
+         "    c4 = %1\n\t"
+         "    if (!p2) memw(%2) = #0\n\t" /* must still see the old p2 */
+         "}\n\t"
+         :
+         : "r"(0), "r"(0xffffffff), "r"(&result)
+         : "p0", "p1", "p2", "p3", "memory");
+    check(result, 0x0);                  /* so the store is performed */
+}
+
 int main()
 {
     int c4;
@@ -162,6 +198,8 @@ int main()
     creg_alias_pair(0xffffffff, &pregs);
     check(pregs.creg, 0xffffffff);
 
+    test_packet();
+
     puts(err ? "FAIL" : "PASS");
     return err;
 }
-- 
cgit v1.1