about summary refs log tree commit diff
path: root/gcc/config/i386
diff options
context:
space:
mode:
authorUros Bizjak <ubizjak@gmail.com>2022-01-02 21:12:10 +0100
committerUros Bizjak <ubizjak@gmail.com>2022-01-02 21:13:14 +0100
commit9ff206d3865df5cb8407490aa9481029beac087f (patch)
treefb8a6fee872e0a4ddc63c25ab4dd90ee7983428e /gcc/config/i386
parent6bec6e3aaa306ca7b87d6e6654acca546fa25e90 (diff)
downloadgcc-9ff206d3865df5cb8407490aa9481029beac087f.zip
gcc-9ff206d3865df5cb8407490aa9481029beac087f.tar.gz
gcc-9ff206d3865df5cb8407490aa9481029beac087f.tar.bz2
i386: Introduce V2QImode vectorized arithmetic [PR103861]
This patch adds basic V2QImode infrastructure and V2QImode arithmetic operations (plus, minus and neg). The patched compiler can emit SSE vectorized QImode operations (e.g. PADDB) with partial QImode vector, and also synthesized double HI/LO QImode operations with integer registers.

The testcase:

typedef char __v2qi __attribute__ ((__vector_size__ (2)));

__v2qi plus (__v2qi a, __v2qi b) { return a + b; };

compiles with -O2 to:

        movl    %edi, %edx
        movl    %esi, %eax
        addb    %sil, %dl
        addb    %ah, %dh
        movl    %edx, %eax
        ret

which is much better than what the unpatched compiler produces:

        movl    %edi, %eax
        movl    %esi, %edx
        xorl    %ecx, %ecx
        movb    %dil, %cl
        movsbl  %dh, %edx
        movsbl  %ah, %eax
        addl    %edx, %eax
        addb    %sil, %cl
        movb    %al, %ch
        movl    %ecx, %eax
        ret

The V2QImode vectorization does not require vector registers, so it can be enabled by default also for 32-bit targets without SSE. The patch also enables vectorized V2QImode sign/zero extends.

2021-12-30  Uroš Bizjak  <ubizjak@gmail.com>

gcc/ChangeLog:

	PR target/103861
	* config/i386/i386.h (VALID_SSE2_REG_MODE): Add V2QImode.
	(VALID_INT_MODE_P): Ditto.
	* config/i386/i386.c (ix86_secondary_reload): Handle V2QImode
	reloads from SSE register to memory.
	(vector_mode_supported_p): Always return true for V2QImode.
	* config/i386/i386.md (*subqi_ext<mode>_2): New insn pattern.
	(*negqi_ext<mode>_2): Ditto.
	* config/i386/mmx.md (movv2qi): New expander.
	(movmisalignv2qi): Ditto.
	(*movv2qi_internal): New insn pattern.
	(*pushv2qi2): Ditto.
	(negv2qi2 and splitters): Ditto.
	(<plusminus:insn>v2qi3 and splitters): Ditto.

gcc/testsuite/ChangeLog:

	PR target/103861
	* gcc.dg/store_merging_18.c (dg-options): Add -fno-tree-vectorize.
	* gcc.dg/store_merging_29.c (dg-options): Ditto.
	* gcc.target/i386/pr103861.c: New test.
	* gcc.target/i386/pr92658-avx512vl.c (dg-final): Remove vpmovqb
	scan-assembler xfail.
	* gcc.target/i386/pr92658-sse4.c (dg-final): Remove pmovzxbq
	scan-assembler xfail.
	* gcc.target/i386/pr92658-sse4-2.c (dg-final): Remove pmovsxbq
	scan-assembler xfail.
	* gcc.target/i386/warn-vect-op-2.c (dg-warning): Adjust warnings.
Diffstat (limited to 'gcc/config/i386')
-rw-r--r--gcc/config/i386/i386.c4
-rw-r--r--gcc/config/i386/i386.h5
-rw-r--r--gcc/config/i386/i386.md43
-rw-r--r--gcc/config/i386/mmx.md284
4 files changed, 331 insertions, 5 deletions
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index ec15582..4e02b26 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -19306,7 +19306,7 @@ ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
}
/* Require movement to gpr, and then store to memory. */
- if ((mode == HFmode || mode == HImode)
+ if ((mode == HFmode || mode == HImode || mode == V2QImode)
&& !TARGET_SSE4_1
&& SSE_CLASS_P (rclass)
&& !in_p && MEM_P (x))
@@ -22082,6 +22082,8 @@ ix86_vector_mode_supported_p (machine_mode mode)
if ((TARGET_3DNOW || TARGET_MMX_WITH_SSE)
&& VALID_MMX_REG_MODE_3DNOW (mode))
return true;
+ if (mode == V2QImode)
+ return true;
return false;
}
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 398f751..3adb1cb 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -1039,7 +1039,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
((MODE) == V16QImode || (MODE) == V8HImode || (MODE) == V2DFmode \
|| (MODE) == V8HFmode || (MODE) == V4HFmode || (MODE) == V2HFmode \
|| (MODE) == V4QImode || (MODE) == V2HImode || (MODE) == V1SImode \
- || (MODE) == V2DImode || (MODE) == DFmode || (MODE) == HFmode)
+ || (MODE) == V2DImode || (MODE) == V2QImode || (MODE) == DFmode \
+ || (MODE) == HFmode)
#define VALID_SSE_REG_MODE(MODE) \
((MODE) == V1TImode || (MODE) == TImode \
@@ -1072,7 +1073,7 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
|| (MODE) == SDmode || (MODE) == DDmode \
|| (MODE) == HFmode || (MODE) == HCmode \
|| (MODE) == V2HImode || (MODE) == V2HFmode \
- || (MODE) == V1SImode || (MODE) == V4QImode \
+ || (MODE) == V1SImode || (MODE) == V4QImode || (MODE) == V2QImode \
|| (TARGET_64BIT \
&& ((MODE) == TImode || (MODE) == CTImode \
|| (MODE) == TFmode || (MODE) == TCmode \
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index e670e7d..cd95509 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -6931,6 +6931,30 @@
operands[4] = gen_rtx_SIGN_EXTEND (<DPWI>mode, operands[2]);
})
+(define_insn "*subqi_ext<mode>_2"
+ [(set (zero_extract:SWI248
+ (match_operand:SWI248 0 "register_operand" "+Q")
+ (const_int 8)
+ (const_int 8))
+ (subreg:SWI248
+ (minus:QI
+ (subreg:QI
+ (zero_extract:SWI248
+ (match_operand:SWI248 1 "register_operand" "0")
+ (const_int 8)
+ (const_int 8)) 0)
+ (subreg:QI
+ (zero_extract:SWI248
+ (match_operand:SWI248 2 "register_operand" "Q")
+ (const_int 8)
+ (const_int 8)) 0)) 0))
+ (clobber (reg:CC FLAGS_REG))]
+ "/* FIXME: without this LRA can't reload this pattern, see PR82524. */
+ rtx_equal_p (operands[0], operands[1])"
+ "sub{b}\t{%h2, %h0|%h0, %h2}"
+ [(set_attr "type" "alu")
+ (set_attr "mode" "QI")])
+
(define_insn "*subv<mode>4"
[(set (reg:CCO FLAGS_REG)
(eq:CCO (minus:<DWI>
@@ -10901,6 +10925,25 @@
[(set_attr "type" "negnot")
(set_attr "mode" "<MODE>")])
+(define_insn "*negqi_ext<mode>_2"
+ [(set (zero_extract:SWI248
+ (match_operand:SWI248 0 "register_operand" "+Q")
+ (const_int 8)
+ (const_int 8))
+ (subreg:SWI248
+ (neg:QI
+ (subreg:QI
+ (zero_extract:SWI248
+ (match_operand:SWI248 1 "register_operand" "0")
+ (const_int 8)
+ (const_int 8)) 0)) 0))
+ (clobber (reg:CC FLAGS_REG))]
+ "/* FIXME: without this LRA can't reload this pattern, see PR82524. */
+ rtx_equal_p (operands[0], operands[1])"
+ "neg{b}\t%h0"
+ [(set_attr "type" "negnot")
+ (set_attr "mode" "QI")])
+
;; Negate with jump on overflow.
(define_expand "negv<mode>3"
[(parallel [(set (reg:CCO FLAGS_REG)
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index e394cba..c4e71c2 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -261,8 +261,8 @@
"=r ,m ,v,v,v,m,r,v")
(match_operand:V_32 1 "general_operand"
"rmC,rC,C,v,m,v,v,r"))]
- "TARGET_SSE2 &&
- !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+ "TARGET_SSE2
+ && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
{
switch (get_attr_type (insn))
{
@@ -359,6 +359,174 @@
DONE;
})
+(define_expand "movv2qi"
+ [(set (match_operand:V2QI 0 "nonimmediate_operand")
+ (match_operand:V2QI 1 "nonimmediate_operand"))]
+ ""
+{
+ ix86_expand_vector_move (V2QImode, operands);
+ DONE;
+})
+
+(define_insn "*movv2qi_internal"
+ [(set (match_operand:V2QI 0 "nonimmediate_operand"
+ "=r,r,r,m ,v,v,v,m,r,v")
+ (match_operand:V2QI 1 "general_operand"
+ "r ,C,m,rC,C,v,m,v,v,r"))]
+ "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
+{
+ switch (get_attr_type (insn))
+ {
+ case TYPE_IMOV:
+ if (get_attr_mode (insn) == MODE_SI)
+ return "mov{l}\t{%k1, %k0|%k0, %k1}";
+ else
+ return "mov{w}\t{%1, %0|%0, %1}";
+
+ case TYPE_IMOVX:
+ /* movzwl is faster than movw on p2 due to partial word stalls,
+ though not as fast as an aligned movl. */
+ return "movz{wl|x}\t{%1, %k0|%k0, %1}";
+
+ case TYPE_SSELOG1:
+ if (satisfies_constraint_C (operands[1]))
+ return standard_sse_constant_opcode (insn, operands);
+
+ if (SSE_REG_P (operands[0]))
+ return MEM_P (operands[1])
+ ? "%vpinsrw\t{$0, %1, %d0|%d0, %1, 0}"
+ : "%vpinsrw\t{$0, %k1, %d0|%d0, %k1, 0}";
+ else
+ return MEM_P (operands[0])
+ ? "%vpextrw\t{$0, %1, %0|%0, %1, 0}"
+ : "%vpextrw\t{$0, %1, %k0|%k0, %1, 0}";
+
+ case TYPE_SSEMOV:
+ return ix86_output_ssemov (insn, operands);
+
+ default:
+ gcc_unreachable ();
+ }
+}
+ [(set (attr "isa")
+ (cond [(eq_attr "alternative" "4,5,6,8,9")
+ (const_string "sse2")
+ (eq_attr "alternative" "7")
+ (const_string "sse4")
+ ]
+ (const_string "*")))
+ (set (attr "type")
+ (cond [(eq_attr "alternative" "6,7,8,9")
+ (if_then_else (match_test "TARGET_AVX512FP16")
+ (const_string "ssemov")
+ (const_string "sselog1"))
+ (eq_attr "alternative" "4")
+ (const_string "sselog1")
+ (eq_attr "alternative" "5")
+ (const_string "ssemov")
+ (match_test "optimize_function_for_size_p (cfun)")
+ (const_string "imov")
+ (and (eq_attr "alternative" "0")
+ (ior (not (match_test "TARGET_PARTIAL_REG_STALL"))
+ (not (match_test "TARGET_HIMODE_MATH"))))
+ (const_string "imov")
+ (and (eq_attr "alternative" "1,2")
+ (match_operand:V2QI 1 "aligned_operand"))
+ (const_string "imov")
+ (and (match_test "TARGET_MOVX")
+ (eq_attr "alternative" "0,2"))
+ (const_string "imovx")
+ ]
+ (const_string "imov")))
+ (set (attr "prefix")
+ (cond [(eq_attr "alternative" "4,5,6,7,8,9")
+ (const_string "maybe_evex")
+ ]
+ (const_string "orig")))
+ (set (attr "mode")
+ (cond [(eq_attr "alternative" "6,7,8,9")
+ (if_then_else (match_test "TARGET_AVX512FP16")
+ (const_string "HI")
+ (const_string "TI"))
+ (eq_attr "alternative" "4")
+ (cond [(match_test "TARGET_AVX")
+ (const_string "TI")
+ (ior (not (match_test "TARGET_SSE2"))
+ (match_test "optimize_function_for_size_p (cfun)"))
+ (const_string "V4SF")
+ ]
+ (const_string "TI"))
+ (eq_attr "alternative" "5")
+ (cond [(match_test "TARGET_AVX512FP16")
+ (const_string "HI")
+ (match_test "TARGET_AVX")
+ (const_string "TI")
+ (ior (not (match_test "TARGET_SSE2"))
+ (match_test "optimize_function_for_size_p (cfun)"))
+ (const_string "V4SF")
+ ]
+ (const_string "TI"))
+ (eq_attr "type" "imovx")
+ (const_string "SI")
+ (and (eq_attr "alternative" "1,2")
+ (match_operand:V2QI 1 "aligned_operand"))
+ (const_string "SI")
+ (and (eq_attr "alternative" "0")
+ (ior (not (match_test "TARGET_PARTIAL_REG_STALL"))
+ (not (match_test "TARGET_HIMODE_MATH"))))
+ (const_string "SI")
+ ]
+ (const_string "HI")))
+ (set (attr "preferred_for_speed")
+ (cond [(eq_attr "alternative" "8")
+ (symbol_ref "TARGET_INTER_UNIT_MOVES_FROM_VEC")
+ (eq_attr "alternative" "9")
+ (symbol_ref "TARGET_INTER_UNIT_MOVES_TO_VEC")
+ ]
+ (symbol_ref "true")))])
+
+;; We always round up to UNITS_PER_WORD bytes.
+(define_insn "*pushv2qi2"
+ [(set (match_operand:V2QI 0 "push_operand" "=X,X")
+ (match_operand:V2QI 1 "nonmemory_no_elim_operand" "rC,v"))]
+ ""
+ "* return TARGET_64BIT ? \"push{q}\t%q1\" : \"push{l}\t%k1\";
+ #"
+ [(set_attr "isa" "*,sse4")
+ (set_attr "type" "push,multi")
+ (set (attr "mode")
+ (cond [(eq_attr "alternative" "0")
+ (if_then_else (match_test "TARGET_64BIT")
+ (const_string "DI")
+ (const_string "SI"))
+ (eq_attr "alternative" "1")
+ (if_then_else (match_test "TARGET_AVX512FP16")
+ (const_string "HI")
+ (const_string "TI"))
+ ]
+ (const_string "HI")))])
+
+(define_split
+ [(set (match_operand:V2QI 0 "push_operand")
+ (match_operand:V2QI 1 "sse_reg_operand"))]
+ "TARGET_SSE4_1 && reload_completed"
+ [(set (reg:P SP_REG) (plus:P (reg:P SP_REG) (match_dup 2)))
+ (set (match_dup 0) (match_dup 1))]
+{
+ operands[2] = GEN_INT (-PUSH_ROUNDING (GET_MODE_SIZE (V2QImode)));
+ /* Preserve memory attributes. */
+ operands[0] = replace_equiv_address (operands[0], stack_pointer_rtx);
+})
+
+(define_expand "movmisalignv2qi"
+ [(set (match_operand:V2QI 0 "nonimmediate_operand")
+ (match_operand:V2QI 1 "nonimmediate_operand"))]
+ ""
+{
+ ix86_expand_vector_move (V2QImode, operands);
+ DONE;
+})
+
(define_insn "sse_movntq"
[(set (match_operand:DI 0 "memory_operand" "=m,m")
(unspec:DI [(match_operand:DI 1 "register_operand" "y,r")]
@@ -1461,6 +1629,58 @@
"TARGET_MMX_WITH_SSE"
"operands[2] = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode));")
+(define_insn "negv2qi2"
+ [(set (match_operand:V2QI 0 "register_operand" "=Q,&Yw")
+ (neg:V2QI
+ (match_operand:V2QI 1 "register_operand" "0,Yw")))
+ (clobber (reg:CC FLAGS_REG))]
+ ""
+ "#"
+ [(set_attr "isa" "*,sse2")
+ (set_attr "type" "multi")
+ (set_attr "mode" "QI,TI")])
+
+(define_split
+ [(set (match_operand:V2QI 0 "general_reg_operand")
+ (neg:V2QI
+ (match_operand:V2QI 1 "general_reg_operand")))
+ (clobber (reg:CC FLAGS_REG))]
+ "reload_completed"
+ [(parallel
+ [(set (strict_low_part (match_dup 0))
+ (neg:QI (match_dup 1)))
+ (clobber (reg:CC FLAGS_REG))])
+ (parallel
+ [(set (zero_extract:HI (match_dup 2) (const_int 8) (const_int 8))
+ (subreg:HI
+ (neg:QI
+ (subreg:QI
+ (zero_extract:HI (match_dup 3)
+ (const_int 8)
+ (const_int 8)) 0)) 0))
+ (clobber (reg:CC FLAGS_REG))])]
+{
+ operands[3] = gen_lowpart (HImode, operands[1]);
+ operands[2] = gen_lowpart (HImode, operands[0]);
+ operands[1] = gen_lowpart (QImode, operands[1]);
+ operands[0] = gen_lowpart (QImode, operands[0]);
+})
+
+(define_split
+ [(set (match_operand:V2QI 0 "sse_reg_operand")
+ (neg:V2QI
+ (match_operand:V2QI 1 "sse_reg_operand")))
+ (clobber (reg:CC FLAGS_REG))]
+ "reload_completed"
+ [(set (match_dup 0) (match_dup 2))
+ (set (match_dup 0)
+ (minus:V4QI (match_dup 0) (match_dup 1)))]
+{
+ operands[2] = CONST0_RTX (V4QImode);
+ operands[1] = gen_lowpart (V4QImode, operands[1]);
+ operands[0] = gen_lowpart (V4QImode, operands[0]);
+})
+
(define_expand "mmx_<insn><mode>3"
[(set (match_operand:MMXMODEI8 0 "register_operand")
(plusminus:MMXMODEI8
@@ -1515,6 +1735,66 @@
(set_attr "type" "sseadd")
(set_attr "mode" "TI")])
+(define_insn "<insn>v2qi3"
+ [(set (match_operand:V2QI 0 "register_operand" "=Q,x,Yw")
+ (plusminus:V2QI
+ (match_operand:V2QI 1 "register_operand" "<comm>0,0,Yw")
+ (match_operand:V2QI 2 "register_operand" "Q,x,Yw")))
+ (clobber (reg:CC FLAGS_REG))]
+ ""
+ "#"
+ [(set_attr "isa" "*,sse2_noavx,avx")
+ (set_attr "type" "multi,sseadd,sseadd")
+ (set_attr "mode" "QI,TI,TI")])
+
+(define_split
+ [(set (match_operand:V2QI 0 "general_reg_operand")
+ (plusminus:V2QI
+ (match_operand:V2QI 1 "general_reg_operand")
+ (match_operand:V2QI 2 "general_reg_operand")))
+ (clobber (reg:CC FLAGS_REG))]
+ "reload_completed"
+ [(parallel
+ [(set (strict_low_part (match_dup 0))
+ (plusminus:QI (match_dup 1) (match_dup 2)))
+ (clobber (reg:CC FLAGS_REG))])
+ (parallel
+ [(set (zero_extract:HI (match_dup 3) (const_int 8) (const_int 8))
+ (subreg:HI
+ (plusminus:QI
+ (subreg:QI
+ (zero_extract:HI (match_dup 4)
+ (const_int 8)
+ (const_int 8)) 0)
+ (subreg:QI
+ (zero_extract:HI (match_dup 5)
+ (const_int 8)
+ (const_int 8)) 0)) 0))
+ (clobber (reg:CC FLAGS_REG))])]
+{
+ operands[5] = gen_lowpart (HImode, operands[2]);
+ operands[4] = gen_lowpart (HImode, operands[1]);
+ operands[3] = gen_lowpart (HImode, operands[0]);
+ operands[2] = gen_lowpart (QImode, operands[2]);
+ operands[1] = gen_lowpart (QImode, operands[1]);
+ operands[0] = gen_lowpart (QImode, operands[0]);
+})
+
+(define_split
+ [(set (match_operand:V2QI 0 "sse_reg_operand")
+ (plusminus:V2QI
+ (match_operand:V2QI 1 "sse_reg_operand")
+ (match_operand:V2QI 2 "sse_reg_operand")))
+ (clobber (reg:CC FLAGS_REG))]
+ "TARGET_SSE2 && reload_completed"
+ [(set (match_dup 0)
+ (plusminus:V4QI (match_dup 1) (match_dup 2)))]
+{
+ operands[2] = gen_lowpart (V4QImode, operands[2]);
+ operands[1] = gen_lowpart (V4QImode, operands[1]);
+ operands[0] = gen_lowpart (V4QImode, operands[0]);
+})
+
(define_expand "mmx_<insn><mode>3"
[(set (match_operand:MMXMODE12 0 "register_operand")
(sat_plusminus:MMXMODE12