aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndrew Jenner <andrew@codesourcery.com>2023-03-22 11:20:40 +0000
committerAndrew Jenner <andrew@codesourcery.com>2023-03-22 11:20:40 +0000
commit4ac218a3f08bb6bfef3a0bea13cf81da8479a722 (patch)
tree7c72195a277b0b1176b76dd83ef875108ddb7069
parent9d0ea7a55ac815c59dd3b004008aa2195c541d12 (diff)
downloadgcc-4ac218a3f08bb6bfef3a0bea13cf81da8479a722.zip
gcc-4ac218a3f08bb6bfef3a0bea13cf81da8479a722.tar.gz
gcc-4ac218a3f08bb6bfef3a0bea13cf81da8479a722.tar.bz2
amdgcn: Add instruction patterns for complex number operations.
gcc/ChangeLog: * config/gcn/gcn-protos.h (gcn_expand_dpp_swap_pairs_insn) (gcn_expand_dpp_distribute_even_insn) (gcn_expand_dpp_distribute_odd_insn): Declare. * config/gcn/gcn-valu.md (@dpp_swap_pairs<mode>) (@dpp_distribute_even<mode>, @dpp_distribute_odd<mode>) (cmul<conj_op><mode>3, cml<addsub_as><mode>4, vec_addsub<mode>3) (cadd<rot><mode>3, vec_fmaddsub<mode>4, vec_fmsubadd<mode>4) (fms<mode>4<exec>, fms<mode>4_negop2<exec>, fms<mode>4) (fms<mode>4_negop2): New patterns. * config/gcn/gcn.cc (gcn_expand_dpp_swap_pairs_insn) (gcn_expand_dpp_distribute_even_insn) (gcn_expand_dpp_distribute_odd_insn): New functions. * config/gcn/gcn.md: Add entries to unspec enum. gcc/testsuite/ChangeLog: * gcc.target/gcn/complex.c: New test.
-rw-r--r--gcc/ChangeLog.omp16
-rw-r--r--gcc/config/gcn/gcn-protos.h5
-rw-r--r--gcc/config/gcn/gcn-valu.md263
-rw-r--r--gcc/config/gcn/gcn.cc73
-rw-r--r--gcc/config/gcn/gcn.md7
-rw-r--r--gcc/testsuite/ChangeLog.omp4
-rwxr-xr-xgcc/testsuite/gcc.target/gcn/complex.c627
7 files changed, 995 insertions, 0 deletions
diff --git a/gcc/ChangeLog.omp b/gcc/ChangeLog.omp
index c724308..86b1735 100644
--- a/gcc/ChangeLog.omp
+++ b/gcc/ChangeLog.omp
@@ -1,3 +1,19 @@
+2023-03-22 Andrew Jenner <andrew@codesourcery.com>
+
+ * config/gcn/gcn-protos.h (gcn_expand_dpp_swap_pairs_insn)
+ (gcn_expand_dpp_distribute_even_insn)
+ (gcn_expand_dpp_distribute_odd_insn): Declare.
+ * config/gcn/gcn-valu.md (@dpp_swap_pairs<mode>)
+ (@dpp_distribute_even<mode>, @dpp_distribute_odd<mode>)
+ (cmul<conj_op><mode>3, cml<addsub_as><mode>4, vec_addsub<mode>3)
+ (cadd<rot><mode>3, vec_fmaddsub<mode>4, vec_fmsubadd<mode>4)
+ (fms<mode>4<exec>, fms<mode>4_negop2<exec>, fms<mode>4)
+ (fms<mode>4_negop2): New patterns.
+ * config/gcn/gcn.cc (gcn_expand_dpp_swap_pairs_insn)
+ (gcn_expand_dpp_distribute_even_insn)
+ (gcn_expand_dpp_distribute_odd_insn): New functions.
+ * config/gcn/gcn.md: Add entries to unspec enum.
+
2023-03-17 Andrew Stubbs <ams@codesourcery.com>
* config/gcn/gcn.cc (gcn_class_max_nregs): Handle vectors in SGPRs.
diff --git a/gcc/config/gcn/gcn-protos.h b/gcc/config/gcn/gcn-protos.h
index 3625a58..ec1247f 100644
--- a/gcc/config/gcn/gcn-protos.h
+++ b/gcc/config/gcn/gcn-protos.h
@@ -27,6 +27,11 @@ extern unsigned int gcn_dwarf_register_number (unsigned int regno);
extern rtx get_exec (int64_t);
extern rtx get_exec (machine_mode mode);
extern char * gcn_expand_dpp_shr_insn (machine_mode, const char *, int, int);
+extern char * gcn_expand_dpp_swap_pairs_insn (machine_mode, const char *, int);
+extern char * gcn_expand_dpp_distribute_even_insn (machine_mode, const char *,
+ int unspec);
+extern char * gcn_expand_dpp_distribute_odd_insn (machine_mode, const char *,
+ int unspec);
extern void gcn_expand_epilogue ();
extern rtx gcn_expand_scaled_offsets (addr_space_t as, rtx base, rtx offsets,
rtx scale, bool unsigned_p, rtx exec);
diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md
index 70e3fa6..44d1071 100644
--- a/gcc/config/gcn/gcn-valu.md
+++ b/gcc/config/gcn/gcn-valu.md
@@ -1326,6 +1326,45 @@
[(set_attr "type" "vop_dpp")
(set_attr "length" "16")])
+(define_insn "@dpp_swap_pairs<mode>"
+ [(set (match_operand:V_noHI 0 "register_operand" "=v")
+ (unspec:V_noHI
+ [(match_operand:V_noHI 1 "register_operand" " v")]
+ UNSPEC_MOV_DPP_SWAP_PAIRS))]
+ ""
+ {
+ return gcn_expand_dpp_swap_pairs_insn (<MODE>mode, "v_mov_b32",
+ UNSPEC_MOV_DPP_SWAP_PAIRS);
+ }
+ [(set_attr "type" "vop_dpp")
+ (set_attr "length" "16")])
+
+(define_insn "@dpp_distribute_even<mode>"
+ [(set (match_operand:V_noHI 0 "register_operand" "=v")
+ (unspec:V_noHI
+ [(match_operand:V_noHI 1 "register_operand" " v")]
+ UNSPEC_MOV_DPP_DISTRIBUTE_EVEN))]
+ ""
+ {
+ return gcn_expand_dpp_distribute_even_insn (<MODE>mode, "v_mov_b32",
+ UNSPEC_MOV_DPP_DISTRIBUTE_EVEN);
+ }
+ [(set_attr "type" "vop_dpp")
+ (set_attr "length" "16")])
+
+(define_insn "@dpp_distribute_odd<mode>"
+ [(set (match_operand:V_noHI 0 "register_operand" "=v")
+ (unspec:V_noHI
+ [(match_operand:V_noHI 1 "register_operand" " v")]
+ UNSPEC_MOV_DPP_DISTRIBUTE_ODD))]
+ ""
+ {
+ return gcn_expand_dpp_distribute_odd_insn (<MODE>mode, "v_mov_b32",
+ UNSPEC_MOV_DPP_DISTRIBUTE_ODD);
+ }
+ [(set_attr "type" "vop_dpp")
+ (set_attr "length" "16")])
+
;; }}}
;; {{{ ALU special case: add/sub
@@ -2287,6 +2326,180 @@
DONE;
})
+(define_int_iterator UNSPEC_CMUL_OP [UNSPEC_CMUL UNSPEC_CMUL_CONJ])
+(define_int_attr conj_op [(UNSPEC_CMUL "") (UNSPEC_CMUL_CONJ "_conj")])
+(define_int_attr cmul_subadd [(UNSPEC_CMUL "sub") (UNSPEC_CMUL_CONJ "add")])
+(define_int_attr cmul_addsub [(UNSPEC_CMUL "add") (UNSPEC_CMUL_CONJ "sub")])
+
+(define_expand "cmul<conj_op><mode>3"
+ [(set (match_operand:V_noHI 0 "register_operand" "=&v")
+ (unspec:V_noHI
+ [(match_operand:V_noHI 1 "register_operand" "v")
+ (match_operand:V_noHI 2 "register_operand" "v")]
+ UNSPEC_CMUL_OP))]
+ ""
+ {
+ // operands[1] a b
+ // operands[2] c d
+ rtx t1 = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_mul<mode>3 (t1, operands[1], operands[2])); // a*c b*d
+
+ rtx s2_perm = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_dpp_swap_pairs<mode> (s2_perm, operands[2])); // d c
+
+ rtx t2 = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_mul<mode>3 (t2, operands[1], s2_perm)); // a*d b*c
+
+ rtx t1_perm = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_dpp_swap_pairs<mode> (t1_perm, t1)); // b*d a*c
+
+ rtx even = gen_rtx_REG (DImode, EXEC_REG);
+ emit_move_insn (even, get_exec (0x5555555555555555UL));
+ rtx dest = operands[0];
+ emit_insn (gen_<cmul_subadd><mode>3_exec (dest, t1, t1_perm, dest, even));
+ // a*c-b*d 0
+
+ rtx t2_perm = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_dpp_swap_pairs<mode> (t2_perm, t2)); // b*c a*d
+
+ rtx odd = gen_rtx_REG (DImode, EXEC_REG);
+ emit_move_insn (odd, get_exec (0xaaaaaaaaaaaaaaaaUL));
+ emit_insn (gen_<cmul_addsub><mode>3_exec (dest, t2, t2_perm, dest, odd));
+ // 0 a*d+b*c
+ DONE;
+ })
+
+(define_code_iterator addsub [plus minus])
+(define_code_attr addsub_as [(plus "a") (minus "s")])
+
+(define_expand "cml<addsub_as><mode>4"
+ [(set (match_operand:V_FP 0 "register_operand" "=&v")
+ (addsub:V_FP
+ (unspec:V_FP
+ [(match_operand:V_FP 1 "register_operand" "v")
+ (match_operand:V_FP 2 "register_operand" "v")]
+ UNSPEC_CMUL)
+ (match_operand:V_FP 3 "register_operand" "v")))]
+ ""
+ {
+ rtx a = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_dpp_distribute_even<mode> (a, operands[1])); // a a
+
+ rtx t1 = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_fm<addsub_as><mode>4 (t1, a, operands[2], operands[3]));
+ // a*c a*d
+
+ rtx b = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_dpp_distribute_odd<mode> (b, operands[1])); // b b
+
+ rtx t2 = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_mul<mode>3 (t2, b, operands[2])); // b*c b*d
+
+ rtx t2_perm = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_dpp_swap_pairs<mode> (t2_perm, t2)); // b*d b*c
+
+ rtx even = gen_rtx_REG (DImode, EXEC_REG);
+ emit_move_insn (even, get_exec (0x5555555555555555UL));
+ rtx dest = operands[0];
+ emit_insn (gen_sub<mode>3_exec (dest, t1, t2_perm, dest, even));
+
+ rtx odd = gen_rtx_REG (DImode, EXEC_REG);
+ emit_move_insn (odd, get_exec (0xaaaaaaaaaaaaaaaaUL));
+ emit_insn (gen_add<mode>3_exec (dest, t1, t2_perm, dest, odd));
+
+ DONE;
+ })
+
+(define_expand "vec_addsub<mode>3"
+ [(set (match_operand:V_noHI 0 "register_operand" "=&v")
+ (vec_merge:V_noHI
+ (minus:V_noHI
+ (match_operand:V_noHI 1 "register_operand" "v")
+ (match_operand:V_noHI 2 "register_operand" "v"))
+ (plus:V_noHI (match_dup 1) (match_dup 2))
+ (const_int 6148914691236517205)))]
+ ""
+ {
+ rtx even = gen_rtx_REG (DImode, EXEC_REG);
+ emit_move_insn (even, get_exec (0x5555555555555555UL));
+ rtx dest = operands[0];
+ rtx x = operands[1];
+ rtx y = operands[2];
+ emit_insn (gen_sub<mode>3_exec (dest, x, y, dest, even));
+ rtx odd = gen_rtx_REG (DImode, EXEC_REG);
+ emit_move_insn (odd, get_exec (0xaaaaaaaaaaaaaaaaUL));
+ emit_insn (gen_add<mode>3_exec (dest, x, y, dest, odd));
+
+ DONE;
+ })
+
+(define_int_iterator CADD [UNSPEC_CADD90 UNSPEC_CADD270])
+(define_int_attr rot [(UNSPEC_CADD90 "90") (UNSPEC_CADD270 "270")])
+(define_int_attr cadd_subadd [(UNSPEC_CADD90 "sub") (UNSPEC_CADD270 "add")])
+(define_int_attr cadd_addsub [(UNSPEC_CADD90 "add") (UNSPEC_CADD270 "sub")])
+
+(define_expand "cadd<rot><mode>3"
+ [(set (match_operand:V_noHI 0 "register_operand" "=&v")
+ (unspec:V_noHI [(match_operand:V_noHI 1 "register_operand" "v")
+ (match_operand:V_noHI 2 "register_operand" "v")]
+ CADD))]
+ ""
+ {
+ rtx dest = operands[0];
+ rtx x = operands[1];
+ rtx y = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_dpp_swap_pairs<mode> (y, operands[2]));
+
+ rtx even = gen_rtx_REG (DImode, EXEC_REG);
+ emit_move_insn (even, get_exec (0x5555555555555555UL));
+ emit_insn (gen_<cadd_subadd><mode>3_exec (dest, x, y, dest, even));
+ rtx odd = gen_rtx_REG (DImode, EXEC_REG);
+ emit_move_insn (odd, get_exec (0xaaaaaaaaaaaaaaaaUL));
+ emit_insn (gen_<cadd_addsub><mode>3_exec (dest, x, y, dest, odd));
+
+ DONE;
+ })
+
+(define_expand "vec_fmaddsub<mode>4"
+ [(match_operand:V_noHI 0 "register_operand" "=&v")
+ (match_operand:V_noHI 1 "register_operand" "v")
+ (match_operand:V_noHI 2 "register_operand" "v")
+ (match_operand:V_noHI 3 "register_operand" "v")]
+ ""
+ {
+ rtx t1 = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_mul<mode>3 (t1, operands[1], operands[2]));
+ rtx even = gen_rtx_REG (DImode, EXEC_REG);
+ emit_move_insn (even, get_exec (0x5555555555555555UL));
+ rtx dest = operands[0];
+ emit_insn (gen_sub<mode>3_exec (dest, t1, operands[3], dest, even));
+ rtx odd = gen_rtx_REG (DImode, EXEC_REG);
+ emit_move_insn (odd, get_exec (0xaaaaaaaaaaaaaaaaUL));
+ emit_insn (gen_add<mode>3_exec (dest, t1, operands[3], dest, odd));
+
+ DONE;
+ })
+
+(define_expand "vec_fmsubadd<mode>4"
+ [(match_operand:V_noHI 0 "register_operand" "=&v")
+ (match_operand:V_noHI 1 "register_operand" "v")
+ (match_operand:V_noHI 2 "register_operand" "v")
+ (match_operand:V_noHI 3 "register_operand" "v")]
+ ""
+ {
+ rtx t1 = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_mul<mode>3 (t1, operands[1], operands[2]));
+ rtx even = gen_rtx_REG (DImode, EXEC_REG);
+ emit_move_insn (even, get_exec (0x5555555555555555UL));
+ rtx dest = operands[0];
+ emit_insn (gen_add<mode>3_exec (dest, t1, operands[3], dest, even));
+ rtx odd = gen_rtx_REG (DImode, EXEC_REG);
+ emit_move_insn (odd, get_exec (0xaaaaaaaaaaaaaaaaUL));
+ emit_insn (gen_add<mode>3_exec (dest, t1, operands[3], dest, odd));
+
+ DONE;
+ })
+
;; }}}
;; {{{ ALU generic case
@@ -2963,6 +3176,56 @@
[(set_attr "type" "vop3a")
(set_attr "length" "8")])
+(define_insn "fms<mode>4<exec>"
+ [(set (match_operand:V_FP 0 "register_operand" "= v, v")
+ (fma:V_FP
+ (match_operand:V_FP 1 "gcn_alu_operand" "% vA, vA")
+ (match_operand:V_FP 2 "gcn_alu_operand" " vA,vSvA")
+ (neg:V_FP
+ (match_operand:V_FP 3 "gcn_alu_operand" "vSvA, vA"))))]
+ ""
+ "v_fma%i0\t%0, %1, %2, -%3"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
+
+(define_insn "fms<mode>4_negop2<exec>"
+ [(set (match_operand:V_FP 0 "register_operand" "= v, v, v")
+ (fma:V_FP
+ (match_operand:V_FP 1 "gcn_alu_operand" " vA, vA,vSvA")
+ (neg:V_FP
+ (match_operand:V_FP 2 "gcn_alu_operand" " vA,vSvA, vA"))
+ (neg:V_FP
+ (match_operand:V_FP 3 "gcn_alu_operand" "vSvA, vA, vA"))))]
+ ""
+ "v_fma%i0\t%0, %1, -%2, -%3"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
+
+(define_insn "fms<mode>4"
+ [(set (match_operand:FP 0 "register_operand" "= v, v")
+ (fma:FP
+ (match_operand:FP 1 "gcn_alu_operand" "% vA, vA")
+ (match_operand:FP 2 "gcn_alu_operand" " vA,vSvA")
+ (neg:FP
+ (match_operand:FP 3 "gcn_alu_operand" "vSvA, vA"))))]
+ ""
+ "v_fma%i0\t%0, %1, %2, -%3"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
+
+(define_insn "fms<mode>4_negop2"
+ [(set (match_operand:FP 0 "register_operand" "= v, v, v")
+ (fma:FP
+ (match_operand:FP 1 "gcn_alu_operand" " vA, vA,vSvA")
+ (neg:FP
+ (match_operand:FP 2 "gcn_alu_operand" " vA,vSvA, vA"))
+ (neg:FP
+ (match_operand:FP 3 "gcn_alu_operand" "vSvA, vA, vA"))))]
+ ""
+ "v_fma%i0\t%0, %1, -%2, -%3"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
+
;; }}}
;; {{{ FP division
diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc
index 2f48405..10f7405 100644
--- a/gcc/config/gcn/gcn.cc
+++ b/gcc/config/gcn/gcn.cc
@@ -5087,6 +5087,79 @@ gcn_vector_alignment_reachable (const_tree ARG_UNUSED (type), bool is_packed)
return !is_packed;
}
+/* Generate DPP pairwise swap instruction.
+ This instruction swaps the values in each even lane with the value in the
+ next one:
+ a, b, c, d -> b, a, d, c.
+ The opcode is given by INSN. */
+
+char *
+gcn_expand_dpp_swap_pairs_insn (machine_mode mode, const char *insn,
+ int ARG_UNUSED (unspec))
+{
+ static char buf[128];
+ const char *dpp;
+
+ /* Add the DPP modifiers. */
+ dpp = "quad_perm:[1,0,3,2]";
+
+ if (vgpr_2reg_mode_p (mode))
+ sprintf (buf, "%s\t%%L0, %%L1 %s\n\t%s\t%%H0, %%H1 %s",
+ insn, dpp, insn, dpp);
+ else
+ sprintf (buf, "%s\t%%0, %%1 %s", insn, dpp);
+
+ return buf;
+}
+
+/* Generate DPP distribute even instruction.
+ This instruction copies the value in each even lane to the next one:
+ a, b, c, d -> a, a, c, c.
+ The opcode is given by INSN. */
+
+char *
+gcn_expand_dpp_distribute_even_insn (machine_mode mode, const char *insn,
+ int ARG_UNUSED (unspec))
+{
+ static char buf[128];
+ const char *dpp;
+
+ /* Add the DPP modifiers. */
+ dpp = "quad_perm:[0,0,2,2]";
+
+ if (vgpr_2reg_mode_p (mode))
+ sprintf (buf, "%s\t%%L0, %%L1 %s\n\t%s\t%%H0, %%H1 %s",
+ insn, dpp, insn, dpp);
+ else
+ sprintf (buf, "%s\t%%0, %%1 %s", insn, dpp);
+
+ return buf;
+}
+
+/* Generate DPP distribute odd instruction.
+ This instruction copies the value in each odd lane to the previous one:
+ a, b, c, d -> b, b, d, d.
+ The opcode is given by INSN. */
+
+char *
+gcn_expand_dpp_distribute_odd_insn (machine_mode mode, const char *insn,
+ int ARG_UNUSED (unspec))
+{
+ static char buf[128];
+ const char *dpp;
+
+ /* Add the DPP modifiers. */
+ dpp = "quad_perm:[1,1,3,3]";
+
+ if (vgpr_2reg_mode_p (mode))
+ sprintf (buf, "%s\t%%L0, %%L1 %s\n\t%s\t%%H0, %%H1 %s",
+ insn, dpp, insn, dpp);
+ else
+ sprintf (buf, "%s\t%%0, %%1 %s", insn, dpp);
+
+ return buf;
+}
+
/* Generate DPP instructions used for vector reductions.
The opcode is given by INSN.
diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md
index 7e803f7..941746f 100644
--- a/gcc/config/gcn/gcn.md
+++ b/gcc/config/gcn/gcn.md
@@ -78,6 +78,13 @@
UNSPEC_PLUS_CARRY_DPP_SHR UNSPEC_PLUS_CARRY_IN_DPP_SHR
UNSPEC_AND_DPP_SHR UNSPEC_IOR_DPP_SHR UNSPEC_XOR_DPP_SHR
UNSPEC_MOV_DPP_SHR
+ UNSPEC_MOV_DPP_SWAP_PAIRS
+ UNSPEC_MOV_DPP_DISTRIBUTE_EVEN
+ UNSPEC_MOV_DPP_DISTRIBUTE_ODD
+ UNSPEC_CMUL UNSPEC_CMUL_CONJ
+ UNSPEC_CMUL_ADD UNSPEC_CMUL_SUB
+ UNSPEC_CADD90
+ UNSPEC_CADD270
UNSPEC_GATHER
UNSPEC_SCATTER
UNSPEC_RCP
diff --git a/gcc/testsuite/ChangeLog.omp b/gcc/testsuite/ChangeLog.omp
index 19357b0..1ed7270 100644
--- a/gcc/testsuite/ChangeLog.omp
+++ b/gcc/testsuite/ChangeLog.omp
@@ -1,3 +1,7 @@
+2023-03-22 Andrew Jenner <andrew@codesourcery.com>
+
+ * gcc.target/gcn/complex.c: New test.
+
2023-03-17 Andrew Stubbs <ams@codesourcery.com>
Backport from mainline:
diff --git a/gcc/testsuite/gcc.target/gcn/complex.c b/gcc/testsuite/gcc.target/gcn/complex.c
new file mode 100755
index 0000000..2304b98
--- /dev/null
+++ b/gcc/testsuite/gcc.target/gcn/complex.c
@@ -0,0 +1,627 @@
+// { dg-do run }
+// { dg-options "-std=c99 -O3" }
+
+#include <stdlib.h>
+#include <stdbool.h>
+
+#define COUNT 1000
+#define MAX 1000
+#define ALIGNMENT (2*1024*1024) // 2MB
+
+_Complex double conj(_Complex double);
+_Complex float conjf(_Complex float);
+
+unsigned int device = 0;
+
+// cmul
+
+void cmulF(float *td, float *te, float *tf, float *tg, int tas)
+{
+ typedef _Complex float complexT;
+ int array_size = tas/2;
+ complexT *d = (complexT*)(td);
+ complexT *e = (complexT*)(te);
+ complexT *f = (complexT*)(tf);
+ for (int i = 0; i < array_size; i++)
+ {
+ d[i] = e[i] * f[i];
+ }
+}
+
+__attribute__((optimize("no-tree-vectorize")))
+bool cmulFcheck(float *td, float *te, float *tf, float *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ float a = te[i*2];
+ float b = te[i*2+1];
+ float c = tf[i*2];
+ float d = tf[i*2+1];
+ if (td[i*2] != a*c-b*d || td[i*2+1] != a*d+b*c)
+ return false;
+ }
+ return true;
+}
+
+void cmulD(double *td, double *te, double *tf, double *tg, int tas)
+{
+ typedef _Complex double complexT;
+ int array_size = tas/2;
+ complexT *d = (complexT*)(td);
+ complexT *e = (complexT*)(te);
+ complexT *f = (complexT*)(tf);
+ for (int i = 0; i < array_size; i++)
+ {
+ d[i] = e[i] * f[i];
+ }
+}
+
+__attribute__((optimize("no-tree-vectorize")))
+bool cmulDcheck(double *td, double *te, double *tf, double *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ double a = te[i*2];
+ double b = te[i*2+1];
+ double c = tf[i*2];
+ double d = tf[i*2+1];
+ if (td[i*2] != a*c-b*d || td[i*2+1] != a*d+b*c)
+ return false;
+ }
+ return true;
+}
+
+
+// cmul_conj
+
+void cmul_conjF(float *td, float *te, float *tf, float *tg, int tas)
+{
+ typedef _Complex float complexT;
+ int array_size = tas/2;
+ complexT *d = (complexT*)(td);
+ complexT *e = (complexT*)(te);
+ complexT *f = (complexT*)(tf);
+ for (int i = 0; i < array_size; i++)
+ {
+ d[i] = e[i] * conj(f[i]);
+ }
+}
+
+__attribute__((optimize("no-tree-vectorize")))
+bool cmul_conjFcheck(float *td, float *te, float *tf, float *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ float a = te[i*2];
+ float b = te[i*2+1];
+ float c = tf[i*2];
+ float d = tf[i*2+1];
+ if (td[i*2] != a*c+b*d || td[i*2+1] != b*c-a*d)
+ return false;
+ }
+ return true;
+}
+
+void cmul_conjD(double *td, double *te, double *tf, double *tg, int tas)
+{
+ typedef _Complex double complexT;
+ int array_size = tas/2;
+ complexT *d = (complexT*)(td);
+ complexT *e = (complexT*)(te);
+ complexT *f = (complexT*)(tf);
+ for (int i = 0; i < array_size; i++)
+ {
+ d[i] = e[i] * conj(f[i]);
+ }
+}
+
+__attribute__((optimize("no-tree-vectorize")))
+bool cmul_conjDcheck(double *td, double *te, double *tf, double *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ double a = te[i*2];
+ double b = te[i*2+1];
+ double c = tf[i*2];
+ double d = tf[i*2+1];
+ if (td[i*2] != a*c+b*d || td[i*2+1] != b*c-a*d)
+ return false;
+ }
+ return true;
+}
+
+
+// addsub
+
+void addsubF(float *td, float *te, float *tf, float *tg, int tas)
+{
+ typedef _Complex float complexT;
+ int array_size = tas/2;
+ complexT *d = (complexT*)(td);
+ complexT *e = (complexT*)(te);
+ complexT *f = (complexT*)(tf);
+ for (int i = 0; i < array_size; i++)
+ {
+ d[i] = e[i] - conjf(f[i]);
+ }
+}
+
+__attribute__((optimize("no-tree-vectorize")))
+bool addsubFcheck(float *td, float *te, float *tf, float *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ float a = te[i*2];
+ float b = te[i*2+1];
+ float c = tf[i*2];
+ float d = tf[i*2+1];
+ if (td[i*2] != a-c || td[i*2+1] != b+d)
+ return false;
+ }
+ return true;
+}
+
+void addsubD(double *td, double *te, double *tf, double *tg, int tas)
+{
+ typedef _Complex double complexT;
+ int array_size = tas/2;
+ complexT *d = (complexT*)(td);
+ complexT *e = (complexT*)(te);
+ complexT *f = (complexT*)(tf);
+ for (int i = 0; i < array_size; i++)
+ {
+ d[i] = e[i] - conj(f[i]);
+ }
+}
+
+__attribute__((optimize("no-tree-vectorize")))
+bool addsubDcheck(double *td, double *te, double *tf, double *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ double a = te[i*2];
+ double b = te[i*2+1];
+ double c = tf[i*2];
+ double d = tf[i*2+1];
+ if (td[i*2] != a-c || td[i*2+1] != b+d)
+ return false;
+ }
+ return true;
+}
+
+
+// fmaddsub
+
+void fmaddsubF(float *td, float *te, float *tf, float *tg, int tas)
+{
+ int array_size = tas/2;
+ for (int i = 0; i < array_size; i++)
+ {
+ td[i*2] = te[i*2]*tf[i*2]-tg[i*2];
+ td[i*2+1] = te[i*2+1]*tf[i*2+1]+tg[i*2+1];
+ }
+}
+
+__attribute__((optimize("no-tree-vectorize")))
+bool fmaddsubFcheck(float *td, float *te, float *tf, float *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ float a = te[i*2];
+ float b = te[i*2+1];
+ float c = tf[i*2];
+ float d = tf[i*2+1];
+ float e = tg[i*2];
+ float f = tg[i*2+1];
+ if (td[i*2] != a*c-e || td[i*2+1] != b*d+f)
+ return false;
+ }
+ return true;
+}
+
+void fmaddsubD(double *td, double *te, double *tf, double *tg, int tas)
+{
+ int array_size = tas/2;
+ for (int i = 0; i < array_size; i++)
+ {
+ td[i*2] = te[i*2]*tf[i*2]-tg[i*2];
+ td[i*2+1] = te[i*2+1]*tf[i*2+1]+tg[i*2+1];
+ }
+}
+
+__attribute__((optimize("no-tree-vectorize")))
+bool fmaddsubDcheck(double *td, double *te, double *tf, double *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ double a = te[i*2];
+ double b = te[i*2+1];
+ double c = tf[i*2];
+ double d = tf[i*2+1];
+ double e = tg[i*2];
+ double f = tg[i*2+1];
+ if (td[i*2] != a*c-e || td[i*2+1] != b*d+f)
+ return false;
+ }
+ return true;
+}
+
+
+// fmsubadd
+
+void fmsubaddF(float *td, float *te, float *tf, float *tg, int tas)
+{
+ int array_size = tas/2;
+ for (int i = 0; i < array_size; i++)
+ {
+ td[i*2] = te[i*2]*tf[i*2]+tg[i*2];
+ td[i*2+1] = te[i*2+1]*tf[i*2+1]-tg[i*2+1];
+ }
+}
+
+__attribute__((optimize("no-tree-vectorize")))
+bool fmsubaddFcheck(float *td, float *te, float *tf, float *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ float a = te[i*2];
+ float b = te[i*2+1];
+ float c = tf[i*2];
+ float d = tf[i*2+1];
+ float e = tg[i*2];
+ float f = tg[i*2+1];
+ if (td[i*2] != a*c+e || td[i*2+1] != b*d-f)
+ return false;
+ }
+ return true;
+}
+
+void fmsubaddD(double *td, double *te, double *tf, double *tg, int tas)
+{
+ int array_size = tas/2;
+ for (int i = 0; i < array_size; i++)
+ {
+ td[i*2] = te[i*2]*tf[i*2]+tg[i*2];
+ td[i*2+1] = te[i*2+1]*tf[i*2+1]-tg[i*2+1];
+ }
+}
+
+__attribute__((optimize("no-tree-vectorize")))
+bool fmsubaddDcheck(double *td, double *te, double *tf, double *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ double a = te[i*2];
+ double b = te[i*2+1];
+ double c = tf[i*2];
+ double d = tf[i*2+1];
+ double e = tg[i*2];
+ double f = tg[i*2+1];
+ if (td[i*2] != a*c+e || td[i*2+1] != b*d-f)
+ return false;
+ }
+ return true;
+}
+
+
+// cadd90
+
+void cadd90F(float *td, float *te, float *tf, float *tg, int tas)
+{
+ int array_size = tas/2;
+ for (int i = 0; i < array_size; i++)
+ {
+ td[i*2] = te[i*2] - tf[i*2+1];
+ td[i*2+1] = te[i*2+1] + tf[i*2];
+ }
+}
+
+__attribute__((optimize("no-tree-vectorize")))
+bool cadd90Fcheck(float *td, float *te, float *tf, float *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ float a = te[i*2];
+ float b = te[i*2+1];
+ float c = tf[i*2];
+ float d = tf[i*2+1];
+ if (td[i*2] != a-d || td[i*2+1] != b+c)
+ return false;
+ }
+ return true;
+}
+
+void cadd90D(double *td, double *te, double *tf, double *tg, int tas)
+{
+ int array_size = tas/2;
+ for (int i = 0; i < array_size; i++)
+ {
+ td[i*2] = te[i*2] - tf[i*2+1];
+ td[i*2+1] = te[i*2+1] + tf[i*2];
+ }
+}
+
+__attribute__((optimize("no-tree-vectorize")))
+bool cadd90Dcheck(double *td, double *te, double *tf, double *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ double a = te[i*2];
+ double b = te[i*2+1];
+ double c = tf[i*2];
+ double d = tf[i*2+1];
+ if (td[i*2] != a-d || td[i*2+1] != b+c)
+ return false;
+ }
+ return true;
+}
+
+// cadd270
+
+void cadd270F(float *td, float *te, float *tf, float *tg, int tas)
+{
+ int array_size = tas/2;
+ for (int i = 0; i < array_size; i++)
+ {
+ td[i*2] = te[i*2] + tf[i*2+1];
+ td[i*2+1] = te[i*2+1] - tf[i*2];
+ }
+}
+
+__attribute__((optimize("no-tree-vectorize")))
+bool cadd270Fcheck(float *td, float *te, float *tf, float *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ float a = te[i*2];
+ float b = te[i*2+1];
+ float c = tf[i*2];
+ float d = tf[i*2+1];
+ if (td[i*2] != a+d || td[i*2+1] != b-c)
+ return false;
+ }
+ return true;
+}
+
+void cadd270D(double *td, double *te, double *tf, double *tg, int tas)
+{
+ int array_size = tas/2;
+ for (int i = 0; i < array_size; i++)
+ {
+ td[i*2] = te[i*2] + tf[i*2+1];
+ td[i*2+1] = te[i*2+1] - tf[i*2];
+ }
+}
+
+__attribute__((optimize("no-tree-vectorize")))
+bool cadd270Dcheck(double *td, double *te, double *tf, double *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ double a = te[i*2];
+ double b = te[i*2+1];
+ double c = tf[i*2];
+ double d = tf[i*2+1];
+ if (td[i*2] != a+d || td[i*2+1] != b-c)
+ return false;
+ }
+ return true;
+}
+
+
+// cmla
+
+void cmlaF(float *td, float *te, float *tf, float *tg, int tas)
+{
+ typedef _Complex float complexT;
+ int array_size = tas/2;
+ complexT *d = (complexT*)(td);
+ complexT *e = (complexT*)(te);
+ complexT *f = (complexT*)(tf);
+ complexT *g = (complexT*)(tg);
+ for (int i = 0; i < array_size; i++)
+ {
+ d[i] = e[i] * f[i] + g[i];
+ }
+}
+
+__attribute__((optimize("no-tree-vectorize")))
+bool cmlaFcheck(float *td, float *te, float *tf, float *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ float a = te[i*2];
+ float b = te[i*2+1];
+ float c = tf[i*2];
+ float d = tf[i*2+1];
+ float e = tg[i*2];
+ float f = tg[i*2+1];
+ if (td[i*2] != a*c-b*d+e || td[i*2+1] != a*d+b*c+f)
+ return false;
+ }
+ return true;
+}
+
+void cmlaD(double *td, double *te, double *tf, double *tg, int tas)
+{
+ typedef _Complex double complexT;
+ int array_size = tas/2;
+ complexT *d = (complexT*)(td);
+ complexT *e = (complexT*)(te);
+ complexT *f = (complexT*)(tf);
+ complexT *g = (complexT*)(tg);
+ for (int i = 0; i < array_size; i++)
+ {
+ d[i] = e[i] * f[i] + g[i];
+ }
+}
+
+__attribute__((optimize("no-tree-vectorize")))
+bool cmlaDcheck(double *td, double *te, double *tf, double *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ double a = te[i*2];
+ double b = te[i*2+1];
+ double c = tf[i*2];
+ double d = tf[i*2+1];
+ double e = tg[i*2];
+ double f = tg[i*2+1];
+ if (td[i*2] != a*c-b*d+e || td[i*2+1] != a*d+b*c+f)
+ return false;
+ }
+ return true;
+}
+
+
+// cmls
+
+void cmlsF(float *td, float *te, float *tf, float *tg, int tas)
+{
+ typedef _Complex float complexT;
+ int array_size = tas/2;
+ complexT *d = (complexT*)(td);
+ complexT *e = (complexT*)(te);
+ complexT *f = (complexT*)(tf);
+ complexT *g = (complexT*)(tg);
+ for (int i = 0; i < array_size; i++)
+ {
+ d[i] = e[i] * f[i] - g[i];
+ }
+}
+
+__attribute__((optimize("no-tree-vectorize")))
+bool cmlsFcheck(float *td, float *te, float *tf, float *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ float a = te[i*2];
+ float b = te[i*2+1];
+ float c = tf[i*2];
+ float d = tf[i*2+1];
+ float e = tg[i*2];
+ float f = tg[i*2+1];
+ if (td[i*2] != a*c-b*d-e || td[i*2+1] != a*d+b*c-f)
+ return false;
+ }
+ return true;
+}
+
+void cmlsD(double *td, double *te, double *tf, double *tg, int tas)
+{
+ typedef _Complex double complexT;
+ int array_size = tas/2;
+ complexT *d = (complexT*)(td);
+ complexT *e = (complexT*)(te);
+ complexT *f = (complexT*)(tf);
+ complexT *g = (complexT*)(tg);
+ for (int i = 0; i < array_size; i++)
+ {
+ d[i] = e[i] * f[i] - g[i];
+ }
+}
+
+__attribute__((optimize("no-tree-vectorize")))
+bool cmlsDcheck(double *td, double *te, double *tf, double *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ double a = te[i*2];
+ double b = te[i*2+1];
+ double c = tf[i*2];
+ double d = tf[i*2+1];
+ double e = tg[i*2];
+ double f = tg[i*2+1];
+ if (td[i*2] != a*c-b*d-e || td[i*2+1] != a*d+b*c-f)
+ return false;
+ }
+ return true;
+}
+
+
+typedef void(*runF)(float *td, float *te, float *tf, float *tg, int tas);
+typedef void(*runD)(double *td, double *te, double *tf, double *tg, int tas);
+typedef bool(*checkF)(float *td, float *te, float *tf, float *tg, int tas);
+typedef bool(*checkD)(double *td, double *te, double *tf, double *tg, int tas);
+
+typedef struct
+{
+ runF rF;
+ runD rD;
+ checkF cF;
+ checkD cD;
+} operation;
+
+operation ops[] = {
+ {cmulF, cmulD, cmulFcheck, cmulDcheck},
+ {cmul_conjF, cmul_conjD, cmul_conjFcheck, cmul_conjDcheck},
+ {addsubF, addsubD, addsubFcheck, addsubDcheck},
+ {fmaddsubF, fmaddsubD, fmaddsubFcheck, fmaddsubDcheck},
+ {fmsubaddF, fmsubaddD, fmsubaddFcheck, fmsubaddDcheck},
+ {cadd90F, cadd90D, cadd90Fcheck, cadd90Dcheck},
+ {cadd270F, cadd270D, cadd270Fcheck, cadd270Dcheck},
+ {cmlaF, cmlaD, cmlaFcheck, cmlaDcheck},
+ {cmlsF, cmlsD, cmlsFcheck, cmlsDcheck}
+};
+
+void testF(operation* op)
+{
+ float* td;
+ float* te;
+ float* tf;
+ float* tg;
+ int array_size = COUNT;
+ td = (float*)malloc(sizeof(float)*array_size);
+ te = (float*)malloc(sizeof(float)*array_size);
+ tf = (float*)malloc(sizeof(float)*array_size);
+ tg = (float*)malloc(sizeof(float)*array_size);
+ float* dd = td;
+ float* ee = te;
+ float* ff = tf;
+ float* gg = tg;
+ for (int i = 0; i < COUNT; ++i)
+ {
+ te[i] = (float)(rand() % MAX);
+ tf[i] = (float)(rand() % MAX);
+ tg[i] = (float)(rand() % MAX);
+ }
+ op->rF(td, te, tf, tg, COUNT);
+ if (!op->cF(td, te, tf, tg, COUNT))
+ abort();
+}
+
+void testD(operation* op)
+{
+ double* td;
+ double* te;
+ double* tf;
+ double* tg;
+ int array_size = COUNT;
+ td = (double*)malloc(sizeof(double)*array_size);
+ te = (double*)malloc(sizeof(double)*array_size);
+ tf = (double*)malloc(sizeof(double)*array_size);
+ tg = (double*)malloc(sizeof(double)*array_size);
+ double* dd = td;
+ double* ee = te;
+ double* ff = tf;
+ double* gg = tg;
+ for (int i = 0; i < COUNT; ++i)
+ {
+ te[i] = (double)(rand() % MAX);
+ tf[i] = (double)(rand() % MAX);
+ tg[i] = (double)(rand() % MAX);
+ }
+ op->rD(td, te, tf, tg, COUNT);
+ if (!op->cD(td, te, tf, tg, COUNT))
+ abort();
+}
+
+int main()
+{
+ for (int i = 0; i < 9; ++i)
+ {
+ testF(&ops[i]);
+ testD(&ops[i]);
+ }
+}
+