diff options
Diffstat (limited to 'gcc/config/i386')
25 files changed, 1325 insertions, 171 deletions
diff --git a/gcc/config/i386/amxavx512intrin.h b/gcc/config/i386/amxavx512intrin.h index ab53625..1e28460 100644 --- a/gcc/config/i386/amxavx512intrin.h +++ b/gcc/config/i386/amxavx512intrin.h @@ -39,8 +39,9 @@ ({ \ __m512 dst; \ __asm__ volatile \ - ("{tcvtrowd2ps\t%1, %%tmm"#src", %0|tcvtrowd2ps\t%0, %%tmm"#src", %1}" \ - : "=v" (dst) : "r" ((unsigned) (A))); \ + ("{tcvtrowd2ps\t%1, %%tmm%c[_src], %0 \ + |tcvtrowd2ps\t%0, tmm%c[_src], %1}" \ + : "=v" (dst) : "r" ((unsigned) (A)), [_src]"i"(src)); \ dst; \ }) @@ -48,8 +49,9 @@ ({ \ __m512 dst; \ __asm__ volatile \ - ("{tcvtrowd2ps\t$"#imm", %%tmm"#src", %0|tcvtrowd2ps\t%0, %%tmm"#src", "#imm"}" \ - : "=v" (dst) :); \ + ("{tcvtrowd2ps\t%[_imm], %%tmm%c[_src], %0 \ + |tcvtrowd2ps\t%0, tmm%c[_src], %[_imm]}" \ + : "=v" (dst) : [_src]"i"(src), [_imm]"i"(imm)); \ dst; \ }) @@ -57,8 +59,9 @@ ({ \ __m512bh dst; \ __asm__ volatile \ - ("{tcvtrowps2bf16h\t%1, %%tmm"#src", %0|tcvtrowps2bf16h\t%0, %%tmm"#src", %1}" \ - : "=v" (dst) : "r" ((unsigned) (A))); \ + ("{tcvtrowps2bf16h\t%1, %%tmm%c[_src], %0 \ + |tcvtrowps2bf16h\t%0, tmm%c[_src], %1}" \ + : "=v" (dst) : "r" ((unsigned) (A)), [_src]"i"(src)); \ dst; \ }) @@ -66,8 +69,9 @@ ({ \ __m512bh dst; \ __asm__ volatile \ - ("{tcvtrowps2bf16h\t$"#imm", %%tmm"#src", %0|tcvtrowps2bf16h\t%0, %%tmm"#src", "#imm"}" \ - : "=v" (dst) :); \ + ("{tcvtrowps2bf16h\t%[_imm], %%tmm%c[_src], %0 \ + |tcvtrowps2bf16h\t%0, tmm%c[_src], %[_imm]}" \ + : "=v" (dst) : [_src]"i"(src), [_imm]"i"(imm)); \ dst; \ }) @@ -75,8 +79,9 @@ ({ \ __m512bh dst; \ __asm__ volatile \ - ("{tcvtrowps2bf16l\t%1, %%tmm"#src", %0|tcvtrowps2bf16l\t%0, %%tmm"#src", %1}" \ - : "=v" (dst) : "r" ((unsigned) (A))); \ + ("{tcvtrowps2bf16l\t%1, %%tmm%c[_src], %0 \ + |tcvtrowps2bf16l\t%0, tmm%c[_src], %1}" \ + : "=v" (dst) : "r" ((unsigned) (A)), [_src]"i"(src)); \ dst; \ }) @@ -84,8 +89,9 @@ ({ \ __m512bh dst; \ __asm__ volatile \ - ("{tcvtrowps2bf16l\t$"#imm", %%tmm"#src", %0|tcvtrowps2bf16l\t%0, %%tmm"#src", "#imm"}" \ - : "=v" (dst) :); \ + ("{tcvtrowps2bf16l\t%[_imm], %%tmm%c[_src], %0 \ + |tcvtrowps2bf16l\t%0, tmm%c[_src], "#imm"}" \ + : "=v" (dst) : [_src]"i"(src), [_imm]"i"(imm)); \ dst; \ }) @@ -93,8 +99,8 @@ ({ \ __m512h dst; \ __asm__ volatile \ - ("{tcvtrowps2phh\t%1, %%tmm"#src", %0|tcvtrowps2phh\t%0, %%tmm"#src", %1}" \ - : "=v" (dst) : "r" ((unsigned) (A))); \ + ("{tcvtrowps2phh\t%1, %%tmm%c[_src], %0|tcvtrowps2phh\t%0, tmm%c[_src], %1}" \ + : "=v" (dst) : "r" ((unsigned) (A)), [_src]"i"(src)); \ dst; \ }) @@ -102,8 +108,9 @@ ({ \ __m512h dst; \ __asm__ volatile \ - ("{tcvtrowps2phh\t$"#imm", %%tmm"#src", %0|tcvtrowps2phh\t%0, %%tmm"#src", "#imm"}" \ - : "=v" (dst) :); \ + ("{tcvtrowps2phh\t%[_imm], %%tmm%c[_src], %0 \ + |tcvtrowps2phh\t%0, tmm%c[_src], "#imm"}" \ + : "=v" (dst) : [_src]"i"(src), [_imm]"i"(imm)); \ dst; \ }) @@ -111,8 +118,8 @@ ({ \ __m512h dst; \ __asm__ volatile \ - ("{tcvtrowps2phl\t%1, %%tmm"#src", %0|tcvtrowps2phl\t%0, %%tmm"#src", %1}" \ - : "=v" (dst) : "r" ((unsigned) (A))); \ + ("{tcvtrowps2phl\t%1, %%tmm%c[_src], %0|tcvtrowps2phl\t%0, tmm%c[_src], %1}" \ + : "=v" (dst) : "r" ((unsigned) (A)), [_src]"i"(src)); \ dst; \ }) @@ -120,8 +127,9 @@ ({ \ __m512h dst; \ __asm__ volatile \ - ("{tcvtrowps2phl\t$"#imm", %%tmm"#src", %0|tcvtrowps2phl\t%0, %%tmm"#src", "#imm"}" \ - : "=v" (dst) :); \ + ("{tcvtrowps2phl\t%[_imm], %%tmm%c[_src], %0 \ + |tcvtrowps2phl\t%0, tmm%c[_src], "#imm"}" \ + : "=v" (dst) : [_src]"i"(src), [_imm]"i"(imm)); \ dst; \ }) @@ -129,8 +137,8 @@ ({ \ __m512 dst; \ __asm__ volatile \ - ("{tilemovrow\t%1, %%tmm"#src", %0|tilemovrow\t%0, %%tmm"#src", %1}" \ - : "=v" (dst) : "r" ((unsigned) (A))); \ + ("{tilemovrow\t%1, %%tmm%c[_src], %0|tilemovrow\t%0, tmm%c[_src], %1}" \ + : "=v" (dst) : "r" ((unsigned) (A)), [_src]"i"(src)); \ dst; \ }) @@ -138,8 +146,9 @@ ({ \ __m512 dst; \ __asm__ volatile \ - ("{tilemovrow\t$"#imm", %%tmm"#src", %0|tilemovrow\t%0, %%tmm"#src", "#imm"}" \ - : "=v" (dst) :); \ + ("{tilemovrow\t%[_imm], %%tmm%c[_src], %0 \ + |tilemovrow\t%0, tmm%c[_src], "#imm"}" \ + : "=v" (dst) : [_src]"i"(src), [_imm]"i"(imm)); \ dst; \ }) diff --git a/gcc/config/i386/amxbf16intrin.h b/gcc/config/i386/amxbf16intrin.h index 9f4a9d1..b2792bb 100644 --- a/gcc/config/i386/amxbf16intrin.h +++ b/gcc/config/i386/amxbf16intrin.h @@ -36,8 +36,10 @@ #if defined(__x86_64__) #define _tile_dpbf16ps_internal(dst,src1,src2) \ - __asm__ volatile\ - ("{tdpbf16ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdpbf16ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::) + __asm__ volatile \ + ("{tdpbf16ps\t%%tmm%c[_src2], %%tmm%c[_src1], %%tmm%c[_dst] \ + |tdpbf16ps\ttmm%c[_dst], tmm%c[_src1], tmm%c[_src2]}" \ + :: [_dst]"i"(dst), [_src1]"i"(src1), [_src2]"i"(src2)) #define _tile_dpbf16ps(dst,src1,src2) \ _tile_dpbf16ps_internal (dst, src1, src2) diff --git a/gcc/config/i386/amxcomplexintrin.h b/gcc/config/i386/amxcomplexintrin.h index fc5964f..55b7d53 100644 --- a/gcc/config/i386/amxcomplexintrin.h +++ b/gcc/config/i386/amxcomplexintrin.h @@ -35,13 +35,17 @@ #endif /* __AMX_COMPLEX__ */ #if defined(__x86_64__) -#define _tile_cmmimfp16ps_internal(src1_dst,src2,src3) \ - __asm__ volatile\ - ("{tcmmimfp16ps\t%%tmm"#src3", %%tmm"#src2", %%tmm"#src1_dst"|tcmmimfp16ps\t%%tmm"#src1_dst", %%tmm"#src2", %%tmm"#src3"}" ::) - -#define _tile_cmmrlfp16ps_internal(src1_dst,src2,src3) \ - __asm__ volatile\ - ("{tcmmrlfp16ps\t%%tmm"#src3", %%tmm"#src2", %%tmm"#src1_dst"|tcmmrlfp16ps\t%%tmm"#src1_dst", %%tmm"#src2", %%tmm"#src3"}" ::) +#define _tile_cmmimfp16ps_internal(src1_dst,src2,src3) \ + __asm__ volatile \ + ("{tcmmimfp16ps\t%%tmm%c[_src3], %%tmm%c[_src2], %%tmm%c[_src1_dst] \ + |tcmmimfp16ps\ttmm%c[_src1_dst], tmm%c[_src2], tmm%c[_src3]}" \ + :: [_src1_dst]"i"(src1_dst), [_src2]"i"(src2), [_src3]"i"(src3)) + +#define _tile_cmmrlfp16ps_internal(src1_dst,src2,src3) \ + __asm__ volatile \ + ("{tcmmrlfp16ps\t%%tmm%c[_src3], %%tmm%c[_src2], %%tmm%c[_src1_dst] \ + |tcmmrlfp16ps\ttmm%c[_src1_dst], tmm%c[_src2], tmm%c[_src3]}" \ + :: [_src1_dst]"i"(src1_dst), [_src2]"i"(src2), [_src3]"i"(src3)) #define _tile_cmmimfp16ps(src1_dst,src2,src3) \ _tile_cmmimfp16ps_internal (src1_dst, src2, src3) diff --git a/gcc/config/i386/amxfp16intrin.h b/gcc/config/i386/amxfp16intrin.h index 02fd031..1e0ef27 100644 --- a/gcc/config/i386/amxfp16intrin.h +++ b/gcc/config/i386/amxfp16intrin.h @@ -29,9 +29,11 @@ #define _AMXFP16INTRIN_H_INCLUDED #if defined(__x86_64__) -#define _tile_dpfp16ps_internal(dst,src1,src2) \ - __asm__ volatile \ - ("{tdpfp16ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdpfp16ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::) +#define _tile_dpfp16ps_internal(dst,src1,src2) \ + __asm__ volatile \ + ("{tdpfp16ps\t%%tmm%c[_src2], %%tmm%c[_src1], %%tmm%c[_dst] \ + |tdpfp16ps\ttmm%c[_dst], tmm%c[_src1], tmm%c[_src2]}" \ + :: [_dst]"i"(dst), [_src1]"i"(src1), [_src2]"i"(src2)) #define _tile_dpfp16ps(dst,src1,src2) \ _tile_dpfp16ps_internal (dst,src1,src2) diff --git a/gcc/config/i386/amxfp8intrin.h b/gcc/config/i386/amxfp8intrin.h index 8952be9..9467f53 100644 --- a/gcc/config/i386/amxfp8intrin.h +++ b/gcc/config/i386/amxfp8intrin.h @@ -29,21 +29,29 @@ #define _AMXFP8INTRIN_H_INCLUDED #if defined(__x86_64__) -#define _tile_dpbf8ps_internal(dst,src1,src2) \ - __asm__ volatile \ - ("{tdpbf8ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdpbf8ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::) - -#define _tile_dpbhf8ps_internal(dst,src1,src2) \ - __asm__ volatile \ - ("{tdpbhf8ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdpbhf8ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::) - -#define _tile_dphbf8ps_internal(dst,src1,src2) \ - __asm__ volatile \ - ("{tdphbf8ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdphbf8ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::) - -#define _tile_dphf8ps_internal(dst,src1,src2) \ - __asm__ volatile \ - ("{tdphf8ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdphf8ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::) +#define _tile_dpbf8ps_internal(dst,src1,src2) \ + __asm__ volatile \ + ("{tdpbf8ps\t%%tmm%c[_src2], %%tmm%c[_src1], %%tmm%c[_dst] \ + |tdpbf8ps\ttmm%c[_dst], tmm%c[_src1], tmm%c[_src2]}" \ + :: [_dst]"i"(dst), [_src1]"i"(src1), [_src2]"i"(src2)) + +#define _tile_dpbhf8ps_internal(dst,src1,src2) \ + __asm__ volatile \ + ("{tdpbhf8ps\t%%tmm%c[_src2], %%tmm%c[_src1], %%tmm%c[_dst] \ + |tdpbhf8ps\ttmm%c[_dst], tmm%c[_src1], tmm%c[_src2]}" \ + :: [_dst]"i"(dst), [_src1]"i"(src1), [_src2]"i"(src2)) + +#define _tile_dphbf8ps_internal(dst,src1,src2) \ + __asm__ volatile \ + ("{tdphbf8ps\t%%tmm%c[_src2], %%tmm%c[_src1], %%tmm%c[_dst] \ + |tdphbf8ps\ttmm%c[_dst], tmm%c[_src1], tmm%c[_src2]}" \ + :: [_dst]"i"(dst), [_src1]"i"(src1), [_src2]"i"(src2)) + +#define _tile_dphf8ps_internal(dst,src1,src2) \ + __asm__ volatile \ + ("{tdphf8ps\t%%tmm%c[_src2], %%tmm%c[_src1], %%tmm%c[_dst] \ + |tdphf8ps\ttmm%c[_dst], tmm%c[_src1], tmm%c[_src2]}" \ + :: [_dst]"i"(dst), [_src1]"i"(src1), [_src2]"i"(src2)) #define _tile_dpbf8ps(dst,src1,src2) \ _tile_dpbf8ps_internal (dst,src1,src2) diff --git a/gcc/config/i386/amxint8intrin.h b/gcc/config/i386/amxint8intrin.h index 332c8db..f7cb36c 100644 --- a/gcc/config/i386/amxint8intrin.h +++ b/gcc/config/i386/amxint8intrin.h @@ -37,7 +37,9 @@ #if defined(__x86_64__) #define _tile_int8_dp_internal(name,dst,src1,src2) \ __asm__ volatile \ - ("{"#name"\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|"#name"\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::) + ("{"#name"\t%%tmm%c[_src2], %%tmm%c[_src1], %%tmm%c[_dst] \ + |"#name"\ttmm%c[_dst], tmm%c[_src1], tmm%c[_src2]}" \ + ::[_dst]"i"(dst),[_src1]"i"(src1),[_src2]"i"(src2)) #define _tile_dpbssd(dst,src1,src2) \ _tile_int8_dp_internal (tdpbssd, dst, src1, src2) diff --git a/gcc/config/i386/amxmovrsintrin.h b/gcc/config/i386/amxmovrsintrin.h index 93a2dbf..9f5d317 100644 --- a/gcc/config/i386/amxmovrsintrin.h +++ b/gcc/config/i386/amxmovrsintrin.h @@ -36,17 +36,17 @@ #define __DISABLE_AMX_MOVRS__ #endif /* __AMX_MOVRS__ */ -#define _tile_loaddrs_internal(tdst, base, stride) \ -__asm__ volatile \ - ("{tileloaddrs\t(%0,%1,1), %%tmm"#tdst \ - "|tileloaddrs\t%%tmm"#tdst", [%0+%1*1]}" \ - :: "r" ((const void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride))) +#define _tile_loaddrs_internal(tdst, base, stride) \ +__asm__ volatile \ + ("{tileloaddrs\t(%0,%1,1), %%tmm%c[_tdst] \ + |tileloaddrs\ttmm%c[_tdst], [%0+%1*1]}" \ + :: "r" ((const void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)), [_tdst]"i"(tdst)) -#define _tile_loaddrst1_internal(tdst, base, stride) \ -__asm__ volatile \ - ("{tileloaddrst1\t(%0,%1,1), %%tmm"#tdst \ - "|tileloaddrst1\t%%tmm"#tdst", [%0+%1*1]}" \ - :: "r" ((const void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride))) +#define _tile_loaddrst1_internal(tdst, base, stride) \ +__asm__ volatile \ + ("{tileloaddrst1\t(%0,%1,1), %%tmm%c[_tdst] \ + |tileloaddrst1\ttmm%c[_tdst], [%0+%1*1]}" \ + :: "r" ((const void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)), [_tdst]"i"(tdst)) #define _tile_loaddrs(tdst, base, stride) \ _tile_loaddrs_internal(tdst, base, stride) diff --git a/gcc/config/i386/amxtf32intrin.h b/gcc/config/i386/amxtf32intrin.h index 8ed910d..a7a1f4f 100644 --- a/gcc/config/i386/amxtf32intrin.h +++ b/gcc/config/i386/amxtf32intrin.h @@ -31,8 +31,10 @@ #if defined(__x86_64__) #define _tile_mmultf32ps_internal(src1_dst,src2,src3) \ - __asm__ volatile\ - ("{tmmultf32ps\t%%tmm"#src3", %%tmm"#src2", %%tmm"#src1_dst"|tmmultf32ps\t%%tmm"#src1_dst", %%tmm"#src2", %%tmm"#src3"}" ::) + __asm__ volatile \ + ("{tmmultf32ps\t%%tmm%c[_src3], %%tmm%c[_src2], %%tmm%c[_src1_dst] \ + |tmmultf32ps\ttmm%c[_src1_dst], tmm%c[_src2], tmm%c[_src3]}" \ + :: [_src1_dst]"i"(src1_dst), [_src2]"i"(src2), [_src3]"i"(src3)) #define _tile_mmultf32ps(src1_dst,src2,src3) \ _tile_mmultf32ps_internal (src1_dst, src2, src3) diff --git a/gcc/config/i386/amxtileintrin.h b/gcc/config/i386/amxtileintrin.h index 8c8e2cd..67c6b53 100644 --- a/gcc/config/i386/amxtileintrin.h +++ b/gcc/config/i386/amxtileintrin.h @@ -61,32 +61,32 @@ _tile_release (void) #define _tile_loadd_internal(dst,base,stride) \ __asm__ volatile \ - ("{tileloadd\t(%0,%1,1), %%tmm"#dst"|tileloadd\t%%tmm"#dst", [%0+%1*1]}" \ - :: "r" ((const void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride))) + ("{tileloadd\t(%0,%1,1), %%tmm%c[_dst]|tileloadd\ttmm%c[_dst], [%0+%1*1]}" \ + :: "r" ((const void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)), [_dst]"i"(dst)) #define _tile_stream_loadd(dst,base,stride) \ _tile_stream_loadd_internal (dst, base, stride) #define _tile_stream_loadd_internal(dst,base,stride) \ __asm__ volatile \ - ("{tileloaddt1\t(%0,%1,1), %%tmm"#dst"|tileloaddt1\t%%tmm"#dst", [%0+%1*1]}" \ - :: "r" ((const void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride))) + ("{tileloaddt1\t(%0,%1,1), %%tmm%c[_dst]|tileloaddt1\ttmm%c[_dst], [%0+%1*1]}" \ + :: "r" ((const void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)), [_dst]"i"(dst)) #define _tile_stored(dst,base,stride) \ _tile_stored_internal (dst, base, stride) #define _tile_stored_internal(src,base,stride) \ __asm__ volatile \ - ("{tilestored\t%%tmm"#src", (%0,%1,1)|tilestored\t[%0+%1*1], %%tmm"#src"}" \ - :: "r" ((void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)) \ - : "memory") + ("{tilestored\t%%tmm%c[_src], (%0,%1,1)|tilestored\t[%0+%1*1], tmm%c[_src]}" \ + :: "r" ((void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)), [_src]"i"(src) \ + : "memory") #define _tile_zero(dst) \ _tile_zero_internal (dst) -#define _tile_zero_internal(dst) \ - __asm__ volatile \ - ("tilezero\t%%tmm"#dst ::) +#define _tile_zero_internal(dst) \ + __asm__ volatile \ + ("{tilezero\t%%tmm%c[_dst]|tilezero\ttmm%c[_dst]}" :: [_dst]"i"(dst)) #endif diff --git a/gcc/config/i386/driver-i386.cc b/gcc/config/i386/driver-i386.cc index 0557df9..b54f0af 100644 --- a/gcc/config/i386/driver-i386.cc +++ b/gcc/config/i386/driver-i386.cc @@ -603,6 +603,9 @@ const char *host_detect_local_cpu (int argc, const char **argv) /* Assume Diamond Rapids. */ if (has_feature (FEATURE_AMX_FP8)) cpu = "diamondrapids"; + /* Assume Nova Lake. */ + else if (has_feature (FEATURE_AVX10_2)) + cpu = "novalake"; /* Assume Granite Rapids D. */ else if (has_feature (FEATURE_AMX_COMPLEX)) cpu = "graniterapids-d"; @@ -643,9 +646,6 @@ const char *host_detect_local_cpu (int argc, const char **argv) /* Assume Clearwater Forest. */ if (has_feature (FEATURE_USER_MSR)) cpu = "clearwaterforest"; - /* Assume Nova Lake. */ - else if (has_feature (FEATURE_PREFETCHI)) - cpu = "novalake"; else if (has_feature (FEATURE_SM3)) { if (has_feature (FEATURE_KL)) diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index a1f1b26..438fa4e 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -4159,12 +4159,18 @@ static bool ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1, rtx if_true, rtx if_false) { - machine_mode mode; + machine_mode mode = GET_MODE (dest); bool is_min; rtx tmp; if (code == LT) ; + else if (code == LE && !HONOR_NANS (mode)) + { + /* We can swap LE to GE and then invert to LT. */ + std::swap (cmp_op0, cmp_op1); + std::swap (if_true, if_false); + } else if (code == UNGE) std::swap (if_true, if_false); else @@ -4177,7 +4183,6 @@ ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0, else return false; - mode = GET_MODE (dest); if (immediate_operand (if_false, mode)) if_false = force_reg (mode, if_false); if (immediate_operand (if_true, mode)) @@ -9995,6 +10000,754 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp, return true; } +/* Fully unroll memmove of known size with up to 8 registers. */ + +static bool +ix86_expand_unroll_movmem (rtx dst, rtx src, rtx destreg, rtx srcreg, + unsigned HOST_WIDE_INT count, + machine_mode mode) +{ + /* If 8 registers registers can cover all memory, load them into + registers and store them together to avoid possible address + overlap between source and destination. */ + unsigned HOST_WIDE_INT moves = count / GET_MODE_SIZE (mode); + if (moves == 0) + { + mode = smallest_int_mode_for_size + (count * BITS_PER_UNIT).require (); + if (count == GET_MODE_SIZE (mode)) + moves = 1; + else + { + /* Reduce the smallest move size by half so that MOVES == 1. */ + mode = smallest_int_mode_for_size + (GET_MODE_BITSIZE (mode) / 2).require (); + moves = count / GET_MODE_SIZE (mode); + gcc_assert (moves == 1); + } + } + else if (moves > 8) + return false; + + unsigned int i; + rtx tmp[9]; + + for (i = 0; i < moves; i++) + tmp[i] = gen_reg_rtx (mode); + + rtx srcmem = change_address (src, mode, srcreg); + for (i = 0; i < moves; i++) + { + emit_move_insn (tmp[i], srcmem); + srcmem = offset_address (srcmem, + GEN_INT (GET_MODE_SIZE (mode)), + GET_MODE_SIZE (mode)); + } + + unsigned int epilogue_size = count & (GET_MODE_SIZE (mode) - 1); + machine_mode epilogue_mode = VOIDmode; + if (epilogue_size) + { + /* Handle the remaining bytes with overlapping move. */ + epilogue_mode = smallest_int_mode_for_size + (epilogue_size * BITS_PER_UNIT).require (); + tmp[8] = gen_reg_rtx (epilogue_mode); + srcmem = adjust_address (srcmem, epilogue_mode, 0); + srcmem = offset_address (srcmem, GEN_INT (epilogue_size), 1); + srcmem = offset_address (srcmem, + GEN_INT (-GET_MODE_SIZE (epilogue_mode)), + GET_MODE_SIZE (epilogue_mode)); + emit_move_insn (tmp[8], srcmem); + } + + rtx destmem = change_address (dst, mode, destreg); + for (i = 0; i < moves; i++) + { + emit_move_insn (destmem, tmp[i]); + destmem = offset_address (destmem, + GEN_INT (GET_MODE_SIZE (mode)), + GET_MODE_SIZE (mode)); + } + + if (epilogue_size) + { + /* Use overlapping move. */ + destmem = adjust_address (destmem, epilogue_mode, 0); + destmem = offset_address (destmem, GEN_INT (epilogue_size), 1); + destmem = offset_address (destmem, + GEN_INT (-GET_MODE_SIZE (epilogue_mode)), + GET_MODE_SIZE (epilogue_mode)); + emit_move_insn (destmem, tmp[8]); + } + + return true; +} + +/* Expand memmove of size with MOVES * mode size and MOVES <= 4. If + FORWARD is true, copy forward. Otherwise copy backward. */ + +static void +ix86_expand_n_move_movmem (rtx destmem, rtx srcmem, machine_mode mode, + unsigned int moves, bool forward) +{ + gcc_assert (moves <= 4); + + unsigned int i; + rtx tmp[8]; + + for (i = 0; i < moves; i++) + tmp[i] = gen_reg_rtx (mode); + + rtx step; + if (forward) + step = GEN_INT (GET_MODE_SIZE (mode)); + else + step = GEN_INT (-GET_MODE_SIZE (mode)); + + /* Load MOVES. */ + for (i = 0; i < moves - 1; i++) + { + emit_move_insn (tmp[i], srcmem); + srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode)); + } + emit_move_insn (tmp[i], srcmem); + + /* Store MOVES. */ + for (i = 0; i < moves - 1; i++) + { + emit_move_insn (destmem, tmp[i]); + destmem = offset_address (destmem, step, GET_MODE_SIZE (mode)); + } + emit_move_insn (destmem, tmp[i]); +} + +/* Load MOVES of mode size into REGS. If LAST is true, load the + last MOVES. Otherwise, load the first MOVES. */ + +static void +ix86_expand_load_movmem (rtx src, rtx srcreg, rtx count_exp, + machine_mode mode, unsigned int moves, + rtx regs[], bool last) +{ + unsigned int i; + + for (i = 0; i < moves; i++) + regs[i] = gen_reg_rtx (mode); + + rtx srcmem = change_address (src, mode, srcreg); + rtx step; + if (last) + { + srcmem = offset_address (srcmem, count_exp, 1); + step = GEN_INT (-GET_MODE_SIZE (mode)); + srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode)); + } + else + step = GEN_INT (GET_MODE_SIZE (mode)); + + for (i = 0; i < moves - 1; i++) + { + emit_move_insn (regs[i], srcmem); + srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode)); + } + emit_move_insn (regs[i], srcmem); +} + +/* Store MOVES of mode size into REGS. If LAST is true, store the + last MOVES. Otherwise, store the first MOVES. */ + +static void +ix86_expand_store_movmem (rtx dst, rtx destreg, rtx count_exp, + machine_mode mode, unsigned int moves, + rtx regs[], bool last) +{ + unsigned int i; + + rtx destmem = change_address (dst, mode, destreg); + rtx step; + if (last) + { + destmem = offset_address (destmem, count_exp, 1); + step = GEN_INT (-GET_MODE_SIZE (mode)); + destmem = offset_address (destmem, step, GET_MODE_SIZE (mode)); + } + else + step = GEN_INT (GET_MODE_SIZE (mode)); + + for (i = 0; i < moves - 1; i++) + { + emit_move_insn (destmem, regs[i]); + destmem = offset_address (destmem, step, GET_MODE_SIZE (mode)); + } + emit_move_insn (destmem, regs[i]); +} + +/* Expand memmove of size between (MOVES / 2) * mode size and + MOVES * mode size with overlapping load and store. MOVES is even. + MOVES >= 2 and MOVES <= 8. */ + +static void +ix86_expand_n_overlapping_move_movmem (rtx dst, rtx src, rtx destreg, + rtx srcreg, rtx count_exp, + machine_mode mode, + unsigned int moves) +{ + gcc_assert (moves >= 2 && moves <= 8 && (moves & 1) == 0); + + unsigned int half_moves = moves / 2; + unsigned int i, j; + rtx tmp[8]; + + for (i = 0; i < moves; i++) + tmp[i] = gen_reg_rtx (mode); + + rtx base_srcmem = change_address (src, mode, srcreg); + + /* Load the first half. */ + rtx srcmem = base_srcmem; + for (i = 0; i < half_moves - 1; i++) + { + emit_move_insn (tmp[i], srcmem); + srcmem = offset_address (srcmem, + GEN_INT (GET_MODE_SIZE (mode)), + GET_MODE_SIZE (mode)); + } + emit_move_insn (tmp[i], srcmem); + + /* Load the second half. */ + srcmem = offset_address (base_srcmem, count_exp, 1); + srcmem = offset_address (srcmem, + GEN_INT (-GET_MODE_SIZE (mode)), + GET_MODE_SIZE (mode)); + for (j = half_moves, i = 0; i < half_moves - 1; i++, j++) + { + emit_move_insn (tmp[j], srcmem); + srcmem = offset_address (srcmem, + GEN_INT (-GET_MODE_SIZE (mode)), + GET_MODE_SIZE (mode)); + } + emit_move_insn (tmp[j], srcmem); + + rtx base_destmem = change_address (dst, mode, destreg); + + /* Store the first half. */ + rtx destmem = base_destmem; + for (i = 0; i < half_moves - 1; i++) + { + emit_move_insn (destmem, tmp[i]); + destmem = offset_address (destmem, + GEN_INT (GET_MODE_SIZE (mode)), + GET_MODE_SIZE (mode)); + } + emit_move_insn (destmem, tmp[i]); + + /* Store the second half. */ + destmem = offset_address (base_destmem, count_exp, 1); + destmem = offset_address (destmem, GEN_INT (-GET_MODE_SIZE (mode)), + GET_MODE_SIZE (mode)); + for (j = half_moves, i = 0; i < half_moves - 1; i++, j++) + { + emit_move_insn (destmem, tmp[j]); + destmem = offset_address (destmem, GEN_INT (-GET_MODE_SIZE (mode)), + GET_MODE_SIZE (mode)); + } + emit_move_insn (destmem, tmp[j]); +} + +/* Expand memmove of size < mode size which is <= 64. */ + +static void +ix86_expand_less_move_movmem (rtx dst, rtx src, rtx destreg, + rtx srcreg, rtx count_exp, + unsigned HOST_WIDE_INT min_size, + machine_mode mode, + rtx_code_label *done_label) +{ + bool skip = false; + machine_mode count_mode = counter_mode (count_exp); + + rtx_code_label *between_32_63_label + = GET_MODE_SIZE (mode) > 32 ? gen_label_rtx () : nullptr; + /* Jump to BETWEEN_32_64_LABEL if size >= 32 and size < 64. */ + if (between_32_63_label) + { + if (min_size && min_size >= 32) + { + emit_jump_insn (gen_jump (between_32_63_label)); + emit_barrier (); + skip = true; + } + else + emit_cmp_and_jump_insns (count_exp, GEN_INT (32), GEU, + nullptr, count_mode, 1, + between_32_63_label); + } + + rtx_code_label *between_16_31_label + = (!skip && GET_MODE_SIZE (mode) > 16) ? gen_label_rtx () : nullptr; + /* Jump to BETWEEN_16_31_LABEL if size >= 16 and size < 31. */ + if (between_16_31_label) + { + if (min_size && min_size >= 16) + { + emit_jump_insn (gen_jump (between_16_31_label)); + emit_barrier (); + skip = true; + } + else + emit_cmp_and_jump_insns (count_exp, GEN_INT (16), GEU, + nullptr, count_mode, 1, + between_16_31_label); + } + + rtx_code_label *between_8_15_label + = (!skip && GET_MODE_SIZE (mode) > 8) ? gen_label_rtx () : nullptr; + /* Jump to BETWEEN_8_15_LABEL if size >= 8 and size < 15. */ + if (between_8_15_label) + { + if (min_size && min_size >= 8) + { + emit_jump_insn (gen_jump (between_8_15_label)); + emit_barrier (); + skip = true; + } + else + emit_cmp_and_jump_insns (count_exp, GEN_INT (8), GEU, + nullptr, count_mode, 1, + between_8_15_label); + } + + rtx_code_label *between_4_7_label + = (!skip && GET_MODE_SIZE (mode) > 4) ? gen_label_rtx () : nullptr; + /* Jump to BETWEEN_4_7_LABEL if size >= 4 and size < 7. */ + if (between_4_7_label) + { + if (min_size && min_size >= 4) + { + emit_jump_insn (gen_jump (between_4_7_label)); + emit_barrier (); + skip = true; + } + else + emit_cmp_and_jump_insns (count_exp, GEN_INT (4), GEU, + nullptr, count_mode, 1, + between_4_7_label); + } + + rtx_code_label *between_2_3_label + = (!skip && GET_MODE_SIZE (mode) > 2) ? gen_label_rtx () : nullptr; + /* Jump to BETWEEN_2_3_LABEL if size >= 2 and size < 3. */ + if (between_2_3_label) + { + if (min_size && min_size >= 2) + { + emit_jump_insn (gen_jump (between_2_3_label)); + emit_barrier (); + skip = true; + } + else + emit_cmp_and_jump_insns (count_exp, GEN_INT (1), GT, + nullptr, count_mode, 1, + between_2_3_label); + } + + if (!skip) + { + rtx_code_label *zero_label + = min_size == 0 ? gen_label_rtx () : nullptr; + /* Skip if size == 0. */ + if (zero_label) + emit_cmp_and_jump_insns (count_exp, GEN_INT (1), LT, + nullptr, count_mode, 1, + zero_label, + profile_probability::unlikely ()); + + /* Move 1 byte. */ + rtx tmp0 = gen_reg_rtx (QImode); + rtx srcmem = change_address (src, QImode, srcreg); + emit_move_insn (tmp0, srcmem); + rtx destmem = change_address (dst, QImode, destreg); + emit_move_insn (destmem, tmp0); + + if (zero_label) + emit_label (zero_label); + + emit_jump_insn (gen_jump (done_label)); + emit_barrier (); + } + + if (between_32_63_label) + { + emit_label (between_32_63_label); + ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg, + count_exp, OImode, 2); + emit_jump_insn (gen_jump (done_label)); + emit_barrier (); + } + + if (between_16_31_label) + { + emit_label (between_16_31_label); + ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg, + count_exp, TImode, 2); + emit_jump_insn (gen_jump (done_label)); + emit_barrier (); + } + + if (between_8_15_label) + { + emit_label (between_8_15_label); + ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg, + count_exp, DImode, 2); + emit_jump_insn (gen_jump (done_label)); + emit_barrier (); + } + + if (between_4_7_label) + { + emit_label (between_4_7_label); + ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg, + count_exp, SImode, 2); + emit_jump_insn (gen_jump (done_label)); + emit_barrier (); + } + + if (between_2_3_label) + { + emit_label (between_2_3_label); + ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg, + count_exp, HImode, 2); + emit_jump_insn (gen_jump (done_label)); + emit_barrier (); + } +} + +/* Expand movmem with overlapping unaligned loads and stores: + 1. Load all sources into registers and store them together to avoid + possible address overlap between source and destination. + 2. For known size, first try to fully unroll with 8 registers. + 3. For size <= 2 * MOVE_MAX, load all sources into 2 registers first + and then store them together. + 4. For size > 2 * MOVE_MAX and size <= 4 * MOVE_MAX, load all sources + into 4 registers first and then store them together. + 5. For size > 4 * MOVE_MAX and size <= 8 * MOVE_MAX, load all sources + into 8 registers first and then store them together. + 6. For size > 8 * MOVE_MAX, + a. If address of destination > address of source, copy backward + with a 4 * MOVE_MAX loop with unaligned loads and stores. Load + the first 4 * MOVE_MAX into 4 registers before the loop and + store them after the loop to support overlapping addresses. + b. Otherwise, copy forward with a 4 * MOVE_MAX loop with unaligned + loads and stores. Load the last 4 * MOVE_MAX into 4 registers + before the loop and store them after the loop to support + overlapping addresses. + */ + +bool +ix86_expand_movmem (rtx operands[]) +{ + /* Since there are much less registers available in 32-bit mode, don't + inline movmem in 32-bit mode. */ + if (!TARGET_64BIT) + return false; + + rtx dst = operands[0]; + rtx src = operands[1]; + rtx count_exp = operands[2]; + rtx expected_size_exp = operands[5]; + rtx min_size_exp = operands[6]; + rtx probable_max_size_exp = operands[8]; + unsigned HOST_WIDE_INT count = HOST_WIDE_INT_0U; + HOST_WIDE_INT expected_size = HOST_WIDE_INT_M1U; + unsigned HOST_WIDE_INT min_size = HOST_WIDE_INT_0U; + unsigned HOST_WIDE_INT probable_max_size = HOST_WIDE_INT_M1U; + + if (CONST_INT_P (count_exp)) + { + min_size = probable_max_size = count = expected_size + = INTVAL (count_exp); + /* When COUNT is 0, there is nothing to do. */ + if (!count) + return true; + } + else + { + if (min_size_exp) + min_size = INTVAL (min_size_exp); + if (probable_max_size_exp) + probable_max_size = INTVAL (probable_max_size_exp); + if (CONST_INT_P (expected_size_exp)) + expected_size = INTVAL (expected_size_exp); + } + + /* Make sure we don't need to care about overflow later on. */ + if (count > (HOST_WIDE_INT_1U << 30)) + return false; + + addr_space_t dst_as = MEM_ADDR_SPACE (dst); + addr_space_t src_as = MEM_ADDR_SPACE (src); + int dynamic_check; + bool noalign; + enum stringop_alg alg = decide_alg (count, expected_size, min_size, + probable_max_size, false, false, + dst_as, src_as, &dynamic_check, + &noalign, false); + if (alg == libcall) + return false; + + rtx destreg = ix86_copy_addr_to_reg (XEXP (dst, 0)); + rtx srcreg = ix86_copy_addr_to_reg (XEXP (src, 0)); + + unsigned int move_max = MOVE_MAX; + machine_mode mode = smallest_int_mode_for_size + (move_max * BITS_PER_UNIT).require (); + if (probable_max_size && probable_max_size < move_max) + { + /* Get a usable MOVE_MAX. */ + mode = smallest_int_mode_for_size + (probable_max_size * BITS_PER_UNIT).require (); + /* Reduce MOVE_MAX by half so that MOVE_MAX can be used. */ + if (GET_MODE_SIZE (mode) > probable_max_size) + mode = smallest_int_mode_for_size + (GET_MODE_BITSIZE (mode) / 2).require (); + move_max = GET_MODE_SIZE (mode); + } + + /* Try to fully unroll memmove of known size first. */ + if (count + && ix86_expand_unroll_movmem (dst, src, destreg, srcreg, count, + mode)) + return true; + + rtx_code_label *done_label = gen_label_rtx (); + + rtx_code_label *less_vec_label = nullptr; + if (min_size == 0 || min_size < move_max) + less_vec_label = gen_label_rtx (); + + machine_mode count_mode = counter_mode (count_exp); + + /* Jump to LESS_VEC_LABEL if size < MOVE_MAX. */ + if (less_vec_label) + emit_cmp_and_jump_insns (count_exp, GEN_INT (move_max), LTU, + nullptr, count_mode, 1, + less_vec_label); + + rtx_code_label *more_2x_vec_label = nullptr; + if (probable_max_size == 0 || probable_max_size > 2 * move_max) + more_2x_vec_label = gen_label_rtx (); + + /* Jump to MORE_2X_VEC_LABEL if size > 2 * MOVE_MAX. */ + if (more_2x_vec_label) + emit_cmp_and_jump_insns (count_exp, GEN_INT (2 * move_max), GTU, + nullptr, count_mode, 1, + more_2x_vec_label); + + if (min_size == 0 || min_size <= 2 * move_max) + { + /* Size >= MOVE_MAX and size <= 2 * MOVE_MAX. */ + ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg, + count_exp, mode, 2); + emit_jump_insn (gen_jump (done_label)); + emit_barrier (); + } + + if (less_vec_label) + { + /* Size < MOVE_MAX. */ + emit_label (less_vec_label); + ix86_expand_less_move_movmem (dst, src, destreg, srcreg, + count_exp, min_size, mode, + done_label); + emit_jump_insn (gen_jump (done_label)); + emit_barrier (); + } + + if (more_2x_vec_label) + { + /* Size > 2 * MOVE_MAX and destination may overlap with source. */ + emit_label (more_2x_vec_label); + + rtx_code_label *more_8x_vec_label = nullptr; + if (probable_max_size == 0 || probable_max_size > 8 * move_max) + more_8x_vec_label = gen_label_rtx (); + + /* Jump to MORE_8X_VEC_LABEL if size > 8 * MOVE_MAX. */ + if (more_8x_vec_label) + emit_cmp_and_jump_insns (count_exp, GEN_INT (8 * move_max), GTU, + nullptr, count_mode, 1, + more_8x_vec_label); + + rtx_code_label *last_4x_vec_label = nullptr; + if (min_size == 0 || min_size < 4 * move_max) + last_4x_vec_label = gen_label_rtx (); + + /* Jump to LAST_4X_VEC_LABEL if size < 4 * MOVE_MAX. */ + if (last_4x_vec_label) + emit_cmp_and_jump_insns (count_exp, GEN_INT (4 * move_max), LTU, + nullptr, count_mode, 1, + last_4x_vec_label); + + if (probable_max_size == 0 || probable_max_size > 4 * move_max) + { + /* Size > 4 * MOVE_MAX and size <= 8 * MOVE_MAX. */ + ix86_expand_n_overlapping_move_movmem (dst, src, destreg, + srcreg, count_exp, + mode, 8); + emit_jump_insn (gen_jump (done_label)); + emit_barrier (); + } + + if (last_4x_vec_label) + { + /* Size > 2 * MOVE_MAX and size <= 4 * MOVE_MAX. */ + emit_label (last_4x_vec_label); + ix86_expand_n_overlapping_move_movmem (dst, src, destreg, + srcreg, count_exp, + mode, 4); + emit_jump_insn (gen_jump (done_label)); + emit_barrier (); + } + + if (more_8x_vec_label) + { + /* Size > 8 * MOVE_MAX. */ + emit_label (more_8x_vec_label); + + rtx loop_count = gen_reg_rtx (count_mode); + emit_move_insn (loop_count, count_exp); + + /* Jump to MORE_8X_VEC_BACKWARD_LABEL if source address is + lower than destination address. */ + rtx_code_label *more_8x_vec_backward_label = gen_label_rtx (); + emit_cmp_and_jump_insns (srcreg, destreg, LTU, nullptr, + GET_MODE (destreg), 1, + more_8x_vec_backward_label); + + /* Skip if source == destination which is less common. */ + emit_cmp_and_jump_insns (srcreg, destreg, EQ, nullptr, + GET_MODE (destreg), 1, done_label, + profile_probability::unlikely ()); + + rtx base_destreg = gen_reg_rtx (GET_MODE (destreg)); + emit_move_insn (base_destreg, destreg); + + /* Load the last 4 * MOVE_MAX. */ + rtx regs[4]; + ix86_expand_load_movmem (src, srcreg, count_exp, mode, + ARRAY_SIZE (regs), regs, true); + + rtx srcmem = change_address (src, mode, srcreg); + rtx destmem = change_address (dst, mode, destreg); + + /* Copy forward with a 4 * MOVE_MAX loop. */ + rtx_code_label *loop_4x_vec_forward_label = gen_label_rtx (); + emit_label (loop_4x_vec_forward_label); + + ix86_expand_n_move_movmem (destmem, srcmem, mode, 4, true); + + rtx tmp; + rtx delta = GEN_INT (4 * MOVE_MAX); + + /* Decrement LOOP_COUNT by 4 * MOVE_MAX. */ + tmp = expand_simple_binop (GET_MODE (loop_count), MINUS, + loop_count, delta, nullptr, 1, + OPTAB_DIRECT); + if (tmp != loop_count) + emit_move_insn (loop_count, tmp); + + /* Increment DESTREG and SRCREG by 4 * MOVE_MAX. */ + tmp = expand_simple_binop (GET_MODE (destreg), PLUS, + destreg, delta, nullptr, 1, + OPTAB_DIRECT); + if (tmp != destreg) + emit_move_insn (destreg, tmp); + tmp = expand_simple_binop (GET_MODE (srcreg), PLUS, srcreg, + delta, nullptr, 1, OPTAB_DIRECT); + if (tmp != srcreg) + emit_move_insn (srcreg, tmp); + + /* Stop if LOOP_EXP <= 4 * MOVE_MAX. */ + emit_cmp_and_jump_insns (loop_count, delta, GTU, nullptr, + GET_MODE (loop_count), 1, + loop_4x_vec_forward_label); + + /* Store the last 4 * MOVE_MAX. */ + ix86_expand_store_movmem (dst, base_destreg, count_exp, mode, + ARRAY_SIZE (regs), regs, true); + + emit_jump_insn (gen_jump (done_label)); + emit_barrier (); + + /* Copy backward with a 4 * MOVE_MAX loop. */ + emit_label (more_8x_vec_backward_label); + + base_destreg = gen_reg_rtx (GET_MODE (destreg)); + emit_move_insn (base_destreg, destreg); + + /* Load the first 4 * MOVE_MAX. */ + ix86_expand_load_movmem (src, srcreg, count_exp, mode, + ARRAY_SIZE (regs), regs, false); + + /* Increment DESTREG and SRCREG by COUNT_EXP. */ + tmp = expand_simple_binop (GET_MODE (destreg), PLUS, + destreg, count_exp, nullptr, 1, + OPTAB_DIRECT); + if (tmp != destreg) + emit_move_insn (destreg, tmp); + tmp = expand_simple_binop (GET_MODE (srcreg), PLUS, srcreg, + count_exp, nullptr, 1, OPTAB_DIRECT); + if (tmp != srcreg) + emit_move_insn (srcreg, tmp); + + srcmem = change_address (src, mode, srcreg); + destmem = change_address (dst, mode, destreg); + rtx step = GEN_INT (-GET_MODE_SIZE (mode)); + srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode)); + destmem = offset_address (destmem, step, GET_MODE_SIZE (mode)); + + rtx_code_label *loop_4x_vec_backward_label = gen_label_rtx (); + emit_label (loop_4x_vec_backward_label); + + ix86_expand_n_move_movmem (destmem, srcmem, mode, 4, false); + + /* Decrement LOOP_COUNT by 4 * MOVE_MAX. */ + tmp = expand_simple_binop (GET_MODE (loop_count), MINUS, + loop_count, delta, nullptr, 1, + OPTAB_DIRECT); + if (tmp != loop_count) + emit_move_insn (loop_count, tmp); + + /* Decrement DESTREG and SRCREG by 4 * MOVE_MAX. */ + tmp = expand_simple_binop (GET_MODE (destreg), MINUS, + destreg, delta, nullptr, 1, + OPTAB_DIRECT); + if (tmp != destreg) + emit_move_insn (destreg, tmp); + tmp = expand_simple_binop (GET_MODE (srcreg), MINUS, srcreg, + delta, nullptr, 1, OPTAB_DIRECT); + if (tmp != srcreg) + emit_move_insn (srcreg, tmp); + + /* Stop if LOOP_EXP <= 4 * MOVE_MAX. */ + emit_cmp_and_jump_insns (loop_count, delta, GTU, nullptr, + GET_MODE (loop_count), 1, + loop_4x_vec_backward_label); + + /* Store the first 4 * MOVE_MAX. */ + ix86_expand_store_movmem (dst, base_destreg, count_exp, mode, + ARRAY_SIZE (regs), regs, false); + + emit_jump_insn (gen_jump (done_label)); + emit_barrier (); + } + } + + emit_label (done_label); + + return true; +} + /* Expand cmpstrn or memcmp. */ bool @@ -26377,17 +27130,15 @@ ix86_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev, struct expand_operand ops[5]; int dfv; - push_to_sequence (*prep_seq); - expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL); - - cmp_mode = op_mode = GET_MODE (op0); + /* Exit early for non integer modes to avoid O(n^2) part of expand_operands. */ + cmp_mode = op_mode = TYPE_MODE (TREE_TYPE (treeop0)); if (!(op_mode == DImode || op_mode == SImode || op_mode == HImode || op_mode == QImode)) - { - end_sequence (); - return NULL_RTX; - } + return NULL_RTX; + + push_to_sequence (*prep_seq); + expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL); icode = code_for_ccmp (op_mode); diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc index 8e27784..ce6f40b 100644 --- a/gcc/config/i386/i386-features.cc +++ b/gcc/config/i386/i386-features.cc @@ -3947,12 +3947,20 @@ ix86_emit_tls_call (rtx tls_set, x86_cse_kind kind, basic_block bb, (note 2 3 5 2 NOTE_INSN_FUNCTION_BEG) (debug_insn 5 2 16 2 (debug_marker) "x.c":6:3 -1 (nil)) + or a basic block with only deleted instructions: + + (code_label 348 23 349 45 3 (nil) [0 uses]) + (note 349 348 436 45 [bb 45] NOTE_INSN_BASIC_BLOCK) + (note 436 349 362 45 NOTE_INSN_DELETED) + */ gcc_assert (DEBUG_INSN_P (insn) || (NOTE_P (insn) && ((NOTE_KIND (insn) == NOTE_INSN_FUNCTION_BEG) || (NOTE_KIND (insn) + == NOTE_INSN_DELETED) + || (NOTE_KIND (insn) == NOTE_INSN_BASIC_BLOCK)))); insn = NULL; break; @@ -4810,6 +4818,9 @@ pass_x86_cse::x86_cse (void) df_process_deferred_rescans (); } + FOR_EACH_VEC_ELT (loads, i, load) + delete load; + df_clear_flags (DF_DEFER_INSN_RESCAN); timevar_pop (TV_MACH_DEP); diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc index dadcf76..35064d8 100644 --- a/gcc/config/i386/i386-options.cc +++ b/gcc/config/i386/i386-options.cc @@ -1837,6 +1837,21 @@ set_ix86_tune_features (struct gcc_options *opts, } parse_mtune_ctrl_str (opts, dump); + + /* mgather/mscatter option would overwrite -mtune-crtl option. */ + if (OPTION_SET_P (ix86_use_gather)) + { + ix86_tune_features[X86_TUNE_USE_GATHER_2PARTS] = ix86_use_gather; + ix86_tune_features[X86_TUNE_USE_GATHER_4PARTS] = ix86_use_gather; + ix86_tune_features[X86_TUNE_USE_GATHER_8PARTS] = ix86_use_gather; + } + + if (OPTION_SET_P (ix86_use_scatter)) + { + ix86_tune_features[X86_TUNE_USE_SCATTER_2PARTS] = ix86_use_scatter; + ix86_tune_features[X86_TUNE_USE_SCATTER_4PARTS] = ix86_use_scatter; + ix86_tune_features[X86_TUNE_USE_SCATTER_8PARTS] = ix86_use_scatter; + } } @@ -2917,7 +2932,7 @@ ix86_option_override_internal (bool main_args_p, else { opts->x_ix86_move_max = opts->x_prefer_vector_width_type; - if (opts_set->x_ix86_move_max == PVW_NONE) + if (opts->x_ix86_move_max == PVW_NONE) { if (TARGET_AVX512F_P (opts->x_ix86_isa_flags)) opts->x_ix86_move_max = PVW_AVX512; diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index bdb8bb9..5ff414a 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -78,6 +78,7 @@ extern void substitute_vpternlog_operands (rtx[]); extern bool ix86_expand_strlen (rtx, rtx, rtx, rtx); extern bool ix86_expand_set_or_cpymem (rtx, rtx, rtx, rtx, rtx, rtx, rtx, rtx, rtx, rtx, bool); +extern bool ix86_expand_movmem (rtx[]); extern bool ix86_expand_cmpstrn_or_cmpmem (rtx, rtx, rtx, rtx, rtx, bool); extern enum reg_class ix86_insn_base_reg_class (rtx_insn *); diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 587b2bd..75a9cb6 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -598,6 +598,20 @@ ix86_canonicalize_comparison (int *code, rtx *op0, rtx *op1, } } + /* SUB (a, b) underflows precisely when a < b. Convert + (compare (minus (a b)) a) to (compare (a b)) + to match *sub<mode>_3 pattern. */ + if (!op0_preserve_value + && (*code == GTU || *code == LEU) + && GET_CODE (*op0) == MINUS + && rtx_equal_p (XEXP (*op0, 0), *op1)) + { + *op1 = XEXP (*op0, 1); + *op0 = XEXP (*op0, 0); + *code = (int) swap_condition ((enum rtx_code) *code); + return; + } + /* Swap operands of GTU comparison to canonicalize addcarry/subborrow comparison. */ if (!op0_preserve_value @@ -23753,9 +23767,15 @@ x86_print_call_or_nop (FILE *file, const char *target, const char *label) { if (flag_nop_mcount || !strcmp (target, "nop")) - /* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1) */ - fprintf (file, "%s" ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n", - label); + { + if (TARGET_16BIT) + /* 3 byte no-op: lea 0(%si), %si */ + fprintf (file, "%s" ASM_BYTE "0x8d, 0x74, 0x00\n", label); + else + /* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1) */ + fprintf (file, "%s" ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n", + label); + } else if (!TARGET_PECOFF && flag_pic) { gcc_assert (flag_plt); @@ -25089,7 +25109,7 @@ i386_solaris_elf_named_section (const char *name, unsigned int flags, return; } -#ifndef USE_GAS +#if !HAVE_GNU_AS if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE) { solaris_elf_asm_comdat_section (name, flags, decl); @@ -26377,7 +26397,20 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, (TREE_OPERAND (gimple_assign_rhs1 (def), 0)))))) { if (fp) - m_num_sse_needed[where]++; + { + /* Scalar FP values residing in x87 registers need to be + spilled and reloaded. */ + auto mode2 = TYPE_MODE (TREE_TYPE (op)); + if (IS_STACK_MODE (mode2)) + { + int cost + = (ix86_cost->hard_register.fp_store[mode2 == SFmode + ? 0 : 1] + + ix86_cost->sse_load[sse_store_index (mode2)]); + stmt_cost += COSTS_N_INSNS (cost) / 2; + } + m_num_sse_needed[where]++; + } else { m_num_gpr_needed[where]++; @@ -26595,6 +26628,11 @@ ix86_vector_costs::finish_cost (const vector_costs *scalar_costs) if (loop_vinfo && !LOOP_VINFO_EPILOGUE_P (loop_vinfo) && LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant () > 2 + /* Avoid a masked epilog if cascaded epilogues eventually get us + to one with VF 1 as that means no scalar epilog at all. */ + && !((GET_MODE_SIZE (loop_vinfo->vector_mode) + / LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant () == 16) + && ix86_tune_features[X86_TUNE_AVX512_TWO_EPILOGUES]) && ix86_tune_features[X86_TUNE_AVX512_MASKED_EPILOGUES] && !OPTION_SET_P (param_vect_partial_vector_usage)) { diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 94f335f..b934117 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -2488,7 +2488,11 @@ constexpr wide_int_bitmask PTA_DIAMONDRAPIDS = PTA_GRANITERAPIDS_D | PTA_CMPCCXADD | PTA_SHA512 | PTA_SM3 | PTA_SM4 | PTA_AVX10_2 | PTA_APX_F | PTA_AMX_AVX512 | PTA_AMX_FP8 | PTA_AMX_TF32 | PTA_MOVRS | PTA_AMX_MOVRS; -constexpr wide_int_bitmask PTA_NOVALAKE = PTA_PANTHERLAKE | PTA_PREFETCHI; +constexpr wide_int_bitmask PTA_NOVALAKE = PTA_PANTHERLAKE | PTA_PREFETCHI + | PTA_AVX512F | PTA_AVX512CD | PTA_AVX512VL | PTA_AVX512BW | PTA_AVX512DQ + | PTA_AVX512VBMI | PTA_AVX512IFMA | PTA_AVX512VNNI | PTA_AVX512VBMI2 + | PTA_AVX512BITALG | PTA_AVX512VPOPCNTDQ | PTA_AVX512FP16 | PTA_AVX512BF16 + | PTA_AVX10_1 | PTA_AVX10_2 | PTA_APX_F | PTA_MOVRS; constexpr wide_int_bitmask PTA_BDVER1 = PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_POPCNT | PTA_LZCNT diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 218377a..df7135f 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -8642,7 +8642,7 @@ [(set (reg FLAGS_REG) (compare (match_operand:SWI 1 "nonimmediate_operand" "0,0,rm,r") (match_operand:SWI 2 "<general_operand>" "<r><i>,<m>,r<i>,<m>"))) - (set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m,<r>i,r,r") + (set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m,<r>,r,r") (minus:SWI (match_dup 1) (match_dup 2)))] "ix86_match_ccmode (insn, CCmode) && ix86_binary_operator_ok (MINUS, <MODE>mode, operands, TARGET_APX_NDD)" @@ -8860,6 +8860,35 @@ (match_dup 0))) (clobber (reg:CC FLAGS_REG))])]) +(define_insn "*add<mode>3_carry_2" + [(set (reg FLAGS_REG) + (compare + (plus:SWI + (plus:SWI + (match_operator:SWI 4 "ix86_carry_flag_operator" + [(match_operand 3 "flags_reg_operand") (const_int 0)]) + (match_operand:SWI 1 "nonimmediate_operand" "%0,0,rm,r")) + (match_operand:SWI 2 "<general_operand>" "<r><i>,<m>,r<i>,<m>")) + (const_int 0))) + (set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m,<r>,r,r") + (plus:SWI + (plus:SWI + (match_op_dup 4 [(match_dup 3) (const_int 0)]) + (match_dup 1)) + (match_dup 2)))] + "ix86_match_ccmode (insn, CCGOCmode) + && ix86_binary_operator_ok (PLUS, <MODE>mode, operands, TARGET_APX_NDD)" + "@ + adc{<imodesuffix>}\t{%2, %0|%0, %2} + adc{<imodesuffix>}\t{%2, %0|%0, %2} + adc{<imodesuffix>}\t{%2, %1, %0|%0, %1, %2} + adc{<imodesuffix>}\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "*,*,apx_ndd,apx_ndd") + (set_attr "type" "alu") + (set_attr "use_carry" "1") + (set_attr "pent_pair" "pu") + (set_attr "mode" "<MODE>")]) + (define_insn "*add<mode>3_carry_0" [(set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m") (plus:SWI @@ -8874,6 +8903,26 @@ (set_attr "pent_pair" "pu") (set_attr "mode" "<MODE>")]) +(define_insn "*add<mode>3_carry_0_cc" + [(set (reg FLAGS_REG) + (compare + (plus:SWI + (match_operator:SWI 2 "ix86_carry_flag_operator" + [(match_operand 3 "flags_reg_operand") (const_int 0)]) + (match_operand:SWI 1 "nonimmediate_operand" "0")) + (const_int 0))) + (set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m") + (plus:SWI + (match_op_dup 2 [(match_dup 3) (const_int 0)]) + (match_dup 1)))] + "ix86_match_ccmode (insn, CCGOCmode) + && (!MEM_P (operands[0]) || rtx_equal_p (operands[0], operands[1]))" + "adc{<imodesuffix>}\t{$0, %0|%0, 0}" + [(set_attr "type" "alu") + (set_attr "use_carry" "1") + (set_attr "pent_pair" "pu") + (set_attr "mode" "<MODE>")]) + (define_insn "*add<mode>3_carry_0r" [(set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m") (plus:SWI @@ -8888,6 +8937,26 @@ (set_attr "pent_pair" "pu") (set_attr "mode" "<MODE>")]) +(define_insn "*add<mode>3_carry_0r_cc" + [(set (reg FLAGS_REG) + (compare + (plus:SWI + (match_operator:SWI 2 "ix86_carry_flag_unset_operator" + [(match_operand 3 "flags_reg_operand") (const_int 0)]) + (match_operand:SWI 1 "nonimmediate_operand" "0")) + (const_int 0))) + (set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m") + (plus:SWI + (match_op_dup 2 [(match_dup 3) (const_int 0)]) + (match_dup 1)))] + "ix86_match_ccmode (insn, CCGOCmode) + && (!MEM_P (operands[0]) || rtx_equal_p (operands[0], operands[1]))" + "sbb{<imodesuffix>}\t{$-1, %0|%0, -1}" + [(set_attr "type" "alu") + (set_attr "use_carry" "1") + (set_attr "pent_pair" "pu") + (set_attr "mode" "<MODE>")]) + (define_insn "*addqi3_carry_zext<mode>" [(set (match_operand:SWI248x 0 "register_operand" "=r,r") (zero_extend:SWI248x @@ -9456,6 +9525,35 @@ (match_dup 0))) (clobber (reg:CC FLAGS_REG))])]) +(define_insn "*sub<mode>3_carry_2" + [(set (reg FLAGS_REG) + (compare + (minus:SWI + (minus:SWI + (match_operand:SWI 1 "nonimmediate_operand" "0,0,rm,r") + (match_operator:SWI 4 "ix86_carry_flag_operator" + [(match_operand 3 "flags_reg_operand") (const_int 0)])) + (match_operand:SWI 2 "<general_operand>" "<r><i>,<m>,r<i>,<m>")) + (const_int 0))) + (set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m,<r>,r,r") + (minus:SWI + (minus:SWI + (match_dup 1) + (match_op_dup 4 [(match_dup 3) (const_int 0)])) + (match_dup 2)))] + "ix86_match_ccmode (insn, CCGOCmode) + && ix86_binary_operator_ok (MINUS, <MODE>mode, operands, TARGET_APX_NDD)" + "@ + sbb{<imodesuffix>}\t{%2, %0|%0, %2} + sbb{<imodesuffix>}\t{%2, %0|%0, %2} + sbb{<imodesuffix>}\t{%2, %1, %0|%0, %1, %2} + sbb{<imodesuffix>}\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "*,*,apx_ndd,apx_ndd") + (set_attr "type" "alu") + (set_attr "use_carry" "1") + (set_attr "pent_pair" "pu") + (set_attr "mode" "<MODE>")]) + (define_insn "*sub<mode>3_carry_0" [(set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m") (minus:SWI @@ -9470,6 +9568,26 @@ (set_attr "pent_pair" "pu") (set_attr "mode" "<MODE>")]) +(define_insn "*sub<mode>3_carry_0_cc" + [(set (reg FLAGS_REG) + (compare + (minus:SWI + (match_operand:SWI 1 "nonimmediate_operand" "0") + (match_operator:SWI 2 "ix86_carry_flag_operator" + [(match_operand 3 "flags_reg_operand") (const_int 0)])) + (const_int 0))) + (set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m") + (minus:SWI + (match_dup 1) + (match_op_dup 2 [(match_dup 3) (const_int 0)])))] + "ix86_match_ccmode (insn, CCGOCmode) + && (!MEM_P (operands[0]) || rtx_equal_p (operands[0], operands[1]))" + "sbb{<imodesuffix>}\t{$0, %0|%0, 0}" + [(set_attr "type" "alu") + (set_attr "use_carry" "1") + (set_attr "pent_pair" "pu") + (set_attr "mode" "<MODE>")]) + (define_insn "*sub<mode>3_carry_0r" [(set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m") (minus:SWI @@ -9484,6 +9602,26 @@ (set_attr "pent_pair" "pu") (set_attr "mode" "<MODE>")]) +(define_insn "*sub<mode>3_carry_0r_cc" + [(set (reg FLAGS_REG) + (compare + (minus:SWI + (match_operand:SWI 1 "nonimmediate_operand" "0") + (match_operator:SWI 2 "ix86_carry_flag_unset_operator" + [(match_operand 3 "flags_reg_operand") (const_int 0)])) + (const_int 0))) + (set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m") + (minus:SWI + (match_dup 1) + (match_op_dup 2 [(match_dup 3) (const_int 0)])))] + "ix86_match_ccmode (insn, CCGOCmode) + && (!MEM_P (operands[0]) || rtx_equal_p (operands[0], operands[1]))" + "adc{<imodesuffix>}\t{$-1, %0|%0, -1}" + [(set_attr "type" "alu") + (set_attr "use_carry" "1") + (set_attr "pent_pair" "pu") + (set_attr "mode" "<MODE>")]) + (define_insn "*subqi3_carry_zext<mode>" [(set (match_operand:SWI248x 0 "register_operand" "=r,r") (zero_extend:SWI248x @@ -12213,7 +12351,7 @@ (compare:CCNO (and:SWI48 (match_operand:SWI48 0 "nonimmediate_operand") - (match_operand:SWI48 1 "<nonmemory_szext_operand>")) + (match_operand:SWI48 1 "<general_szext_operand>")) (const_int 0)))]) (define_expand "testqi_ccz_1" @@ -12221,7 +12359,7 @@ (compare:CCZ (and:QI (match_operand:QI 0 "nonimmediate_operand") - (match_operand:QI 1 "nonmemory_operand")) + (match_operand:QI 1 "general_operand")) (const_int 0)))]) (define_insn "*testdi_1" @@ -12229,7 +12367,7 @@ (compare (and:DI (match_operand:DI 0 "nonimmediate_operand" "%r,rm") - (match_operand:DI 1 "x86_64_szext_nonmemory_operand" "Z,re")) + (match_operand:DI 1 "x86_64_szext_general_operand" "Z,re")) (const_int 0)))] "TARGET_64BIT && ix86_match_ccmode @@ -12242,7 +12380,8 @@ (satisfies_constraint_Z (operands[1]) && (!CONST_INT_P (operands[1]) || val_signbit_known_set_p (SImode, INTVAL (operands[1])))) - ? CCZmode : CCNOmode)" + ? CCZmode : CCNOmode) + && !(MEM_P (operands[0]) && MEM_P (operands[1]))" "@ test{l}\t{%k1, %k0|%k0, %k1} test{q}\t{%1, %0|%0, %1}" @@ -12253,12 +12392,13 @@ [(set (reg FLAGS_REG) (compare (and:QI - (match_operand:QI 0 "nonimmediate_operand" "%qm,qm,r") - (match_operand:QI 1 "nonmemory_operand" "q,n,n")) + (match_operand:QI 0 "nonimmediate_operand" "%qm,*a,qm,r") + (match_operand:QI 1 "general_operand" "q,n,n,n")) (const_int 0)))] "ix86_match_ccmode (insn, CONST_INT_P (operands[1]) - && INTVAL (operands[1]) >= 0 ? CCNOmode : CCZmode)" + && INTVAL (operands[1]) >= 0 ? CCNOmode : CCZmode) + && !(MEM_P (operands[0]) && MEM_P (operands[1]))" { if (get_attr_mode (insn) == MODE_SI) { @@ -12270,7 +12410,7 @@ } [(set_attr "type" "test") (set (attr "mode") - (cond [(eq_attr "alternative" "2") + (cond [(eq_attr "alternative" "3") (const_string "SI") (and (match_test "optimize_insn_for_size_p ()") (and (match_operand 0 "ext_QIreg_operand") @@ -12278,16 +12418,17 @@ (const_string "SI") ] (const_string "QI"))) - (set_attr "pent_pair" "uv,np,np")]) + (set_attr "pent_pair" "uv,uv,np,np")]) (define_insn "*test<mode>_1" [(set (reg FLAGS_REG) (compare (and:SWI124 (match_operand:SWI124 0 "nonimmediate_operand" "%<r>m,*a,<r>m") - (match_operand:SWI124 1 "<nonmemory_szext_operand>" "<r>,<i>,<i>")) - (const_int 0)))] - "ix86_match_ccmode (insn, CCNOmode)" + (match_operand:SWI124 1 "<general_operand>" "<r>,<i>,<i>")) + (const_int 0)))] + "ix86_match_ccmode (insn, CCNOmode) + && !(MEM_P (operands[0]) && MEM_P (operands[1]))" "test{<imodesuffix>}\t{%1, %0|%0, %1}" [(set_attr "type" "test") (set_attr "mode" "<MODE>") @@ -14062,6 +14203,22 @@ (set_attr "isa" "*,apx_ndd") (set_attr "mode" "SI")]) +;; It must be put before *<code><mode>_3, the one below. +(define_insn "*ior<mode>_ccz_1" + [(set (reg:CCZ FLAGS_REG) + (compare:CCZ + (ior:SWI1248_AVX512BWDQ_64 + (match_operand:SWI1248_AVX512BWDQ_64 1 "nonimmediate_operand" "%0,?k") + (match_operand:SWI1248_AVX512BWDQ_64 2 "<general_operand>" "<g>, k")) + (const_int 0))) + (clobber (match_scratch:SWI1248_AVX512BWDQ_64 0 "=<r>, X"))] + "TARGET_AVX512F && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "@ + or{<imodesuffix>}\t{%2, %0|%0, %2} + kortest<mskmodesuffix>\t{%1, %2|%2, %1}" + [(set_attr "type" "alu,msklog") + (set_attr "mode" "<MODE>")]) + (define_insn "*<code><mode>_3" [(set (reg FLAGS_REG) (compare (any_or:SWI @@ -25708,6 +25865,23 @@ (set_attr "length_immediate" "0") (set_attr "modrm" "0")]) +(define_expand "movmem<mode>" + [(use (match_operand:BLK 0 "memory_operand")) + (use (match_operand:BLK 1 "memory_operand")) + (use (match_operand:SWI48 2 "nonmemory_operand")) + (use (match_operand:SWI48 3 "const_int_operand")) + (use (match_operand:SI 4 "const_int_operand")) + (use (match_operand:SI 5 "const_int_operand")) + (use (match_operand:SI 6 "")) + (use (match_operand:SI 7 "")) + (use (match_operand:SI 8 ""))] + "" +{ + if (ix86_expand_movmem (operands)) + DONE; + FAIL; +}) + (define_expand "cpymem<mode>" [(use (match_operand:BLK 0 "memory_operand")) (use (match_operand:BLK 1 "memory_operand")) @@ -27353,6 +27527,72 @@ (match_dup 0))] "peep2_reg_dead_p (2, operands[0])" [(set (match_dup 2) (match_dup 1))]) + +;; umax (a, add (a, b)) => [sum, ovf] = add (a, b); ovf ? a : sum +;; umin (a, add (a, b)) => [sum, ovf] = add (a, b); ovf ? sum : a + +(define_code_attr ovf_add_cmp [(umax "geu") (umin "ltu")]) + +(define_int_iterator ovf_comm [1 2]) + +(define_insn_and_split "*plus_within_<code><mode>3_<ovf_comm>" + [(set (match_operand:SWI248 0 "register_operand") + (umaxmin:SWI248 + (plus:SWI248 (match_operand:SWI248 1 "nonimmediate_operand") + (match_operand:SWI248 2 "<general_operand>")) + (match_dup ovf_comm))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_CMOVE + && ix86_pre_reload_split ()" + "#" + "&& 1" + [(parallel + [(set (reg:CCC FLAGS_REG) + (compare:CCC + (plus:SWI248 (match_dup 1) (match_dup 2)) + (match_dup ovf_comm))) + (set (match_dup 3) + (plus:SWI248 (match_dup 1) (match_dup 2)))]) + (set (match_dup 0) + (if_then_else:SWI248 + (<ovf_add_cmp> (reg:CCC FLAGS_REG) (const_int 0)) + (match_dup 3) + (match_dup ovf_comm)))] +{ + operands[<ovf_comm>] = force_reg (<MODE>mode, operands[<ovf_comm>]); + operands[3] = gen_reg_rtx (<MODE>mode); +}) + +;; umax (a, sub (a, b)) => [diff, udf] = sub (a, b); udf ? diff : a +;; umin (a, sub (a, b)) => [diff, udf] = sub (a, b); udf ? a : diff + +(define_code_attr udf_sub_cmp [(umax "ltu") (umin "geu")]) + +(define_insn_and_split "*minus_within_<code><mode>3" + [(set (match_operand:SWI248 0 "register_operand") + (umaxmin:SWI248 + (minus:SWI248 (match_operand:SWI248 1 "nonimmediate_operand") + (match_operand:SWI248 2 "<general_operand>")) + (match_dup 1))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_CMOVE + && ix86_pre_reload_split ()" + "#" + "&& 1" + [(parallel + [(set (reg:CC FLAGS_REG) + (compare:CC (match_dup 1) (match_dup 2))) + (set (match_dup 3) + (minus:SWI248 (match_dup 1) (match_dup 2)))]) + (set (match_dup 0) + (if_then_else:SWI248 + (<udf_sub_cmp> (reg:CC FLAGS_REG) (const_int 0)) + (match_dup 3) + (match_dup 1)))] +{ + operands[1] = force_reg (<MODE>mode, operands[1]); + operands[3] = gen_reg_rtx (<MODE>mode); +}) ;; Misc patterns (?) @@ -29471,7 +29711,7 @@ [(match_operand:SI 0 "register_operand") (match_operand:SI 1 "register_operand") (match_operand:SWI124 2 "nonimmediate_operand") - (match_operand:SI 3)] + (match_operand:SI 3 "const_int_operand")] "TARGET_CRC32" { /* crc32 uses iSCSI polynomial */ diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt index 8449450..c0093ef 100644 --- a/gcc/config/i386/i386.opt +++ b/gcc/config/i386/i386.opt @@ -1290,11 +1290,11 @@ Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX and SM4 built-in functions and code generation. mgather -Target Alias(mtune-ctrl=, use_gather, ^use_gather) +Target Var(ix86_use_gather) Init(0) Optimization. Enable vectorization for gather instruction. mscatter -Target Alias(mtune-ctrl=, use_scatter, ^use_scatter) +Target Var(ix86_use_scatter) Init(0) Optimization Enable vectorization for scatter instruction. mapxf diff --git a/gcc/config/i386/i386.opt.urls b/gcc/config/i386/i386.opt.urls index a9bbac0..129d91f 100644 --- a/gcc/config/i386/i386.opt.urls +++ b/gcc/config/i386/i386.opt.urls @@ -13,10 +13,10 @@ mlong-double-80 UrlSuffix(gcc/x86-Options.html#index-mlong-double-80) mlong-double-64 -UrlSuffix(gcc/x86-Options.html#index-mlong-double-64-1) +UrlSuffix(gcc/x86-Options.html#index-mlong-double-64-2) mlong-double-128 -UrlSuffix(gcc/x86-Options.html#index-mlong-double-128-1) +UrlSuffix(gcc/x86-Options.html#index-mlong-double-128-2) maccumulate-outgoing-args UrlSuffix(gcc/x86-Options.html#index-maccumulate-outgoing-args-1) @@ -57,7 +57,7 @@ UrlSuffix(gcc/x86-Options.html#index-mfp-ret-in-387) ; duplicate: 'gcc/x86-Options.html#index-mfpmath-1' mhard-float -UrlSuffix(gcc/x86-Options.html#index-mhard-float-11) +UrlSuffix(gcc/x86-Options.html#index-mhard-float-10) mieee-fp UrlSuffix(gcc/x86-Options.html#index-mieee-fp) @@ -120,7 +120,7 @@ mrtd UrlSuffix(gcc/x86-Options.html#index-mrtd-1) msoft-float -UrlSuffix(gcc/x86-Options.html#index-msoft-float-16) +UrlSuffix(gcc/x86-Options.html#index-msoft-float-15) msseregparm UrlSuffix(gcc/x86-Options.html#index-msseregparm) @@ -438,7 +438,7 @@ mpku UrlSuffix(gcc/x86-Options.html#index-mpku) mstack-protector-guard= -UrlSuffix(gcc/x86-Options.html#index-mstack-protector-guard-4) +UrlSuffix(gcc/x86-Options.html#index-mstack-protector-guard-5) mstack-protector-guard-reg= UrlSuffix(gcc/x86-Options.html#index-mstack-protector-guard-reg-3) diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index 57950d3..2863b3e 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1319,9 +1319,6 @@ (ior (match_operand 0 "nonimmediate_operand") (match_test "const_vec_duplicate_p (op)"))) -(define_predicate "const_vec_dup_operand" - (match_test "const_vec_duplicate_p (op)")) - ;; Return true when OP is either register operand, or any ;; CONST_VECTOR. (define_predicate "reg_or_const_vector_operand" @@ -1587,6 +1584,9 @@ (define_predicate "add_comparison_operator" (match_code "geu,ltu")) +(define_predicate "ieee_maxmin_comparison_operator" + (match_code "lt,gt")) + ;; Return true if OP is a valid comparison operator in valid mode. (define_predicate "ix86_comparison_operator" (match_operand 0 "comparison_operator") diff --git a/gcc/config/i386/sol2.h b/gcc/config/i386/sol2.h index 013e87f..3720424 100644 --- a/gcc/config/i386/sol2.h +++ b/gcc/config/i386/sol2.h @@ -60,7 +60,7 @@ along with GCC; see the file COPYING3. If not see /* GNU as understands --32 and --64, but the native Solaris assembler requires -xarch=generic or -xarch=generic64 instead. */ -#ifdef USE_GAS +#if HAVE_GNU_AS #define ASM_CPU32_DEFAULT_SPEC "--32" #define ASM_CPU64_DEFAULT_SPEC "--64" #else @@ -90,16 +90,9 @@ along with GCC; see the file COPYING3. If not see #define ARCH64_SUBDIR "amd64" -#ifdef USE_GLD -/* Since binutils 2.21, GNU ld supports new *_sol2 emulations to strictly - follow the Solaris 2 ABI. Prefer them if present. */ -#ifdef HAVE_LD_SOL2_EMULATION +#if HAVE_GNU_LD #define ARCH32_EMULATION "elf_i386_sol2" #define ARCH64_EMULATION "elf_x86_64_sol2" -#else -#define ARCH32_EMULATION "elf_i386" -#define ARCH64_EMULATION "elf_x86_64" -#endif #endif #define ENDFILE_ARCH_SPEC \ @@ -156,7 +149,7 @@ along with GCC; see the file COPYING3. If not see } \ } while (0) -#ifndef USE_GAS +#if !HAVE_GNU_AS /* The Sun assembler uses .tcomm for TLS common sections. */ #define TLS_COMMON_ASM_OP ".tcomm" @@ -186,7 +179,7 @@ along with GCC; see the file COPYING3. If not see ASM_OUTPUT_LABEL (FILE, NAME); \ } \ while (0) -#endif /* !USE_GAS */ +#endif /* !HAVE_GNU_AS */ /* As in sparc/sol2.h, override the default from i386/x86-64.h to work around Sun as TLS bug. */ @@ -217,13 +210,13 @@ along with GCC; see the file COPYING3. If not see /* Sun as requires "h" flag for large sections, GNU as can do without, but accepts "l". */ -#ifdef USE_GAS +#if HAVE_GNU_AS #define MACH_DEP_SECTION_ASM_FLAG 'l' #else #define MACH_DEP_SECTION_ASM_FLAG 'h' #endif -#ifndef USE_GAS +#if !HAVE_GNU_AS /* Emit COMDAT group signature symbols for Sun as. */ #undef TARGET_ASM_FILE_END #define TARGET_ASM_FILE_END solaris_file_end @@ -231,12 +224,12 @@ along with GCC; see the file COPYING3. If not see /* Unlike GNU ld, Sun ld doesn't coalesce .ctors.N/.dtors.N sections, so inhibit their creation. Also cf. sparc/sysv4.h. */ -#ifndef USE_GLD +#if !HAVE_GNU_LD #define CTORS_SECTION_ASM_OP "\t.section\t.ctors, \"aw\"" #define DTORS_SECTION_ASM_OP "\t.section\t.dtors, \"aw\"" #endif -#ifndef USE_GAS +#if !HAVE_GNU_AS #define LARGECOMM_SECTION_ASM_OP "\t.lbcomm\t" #endif diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 5eba992..fb79b2e 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -643,6 +643,9 @@ (define_mode_iterator VI2_AVX512F [(V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX2") V8HI]) +(define_mode_iterator VI2_AVX10_2 + [(V32HI "TARGET_AVX10_2") (V16HI "TARGET_AVX2") V8HI]) + (define_mode_iterator VI2_AVX512VNNIBW [(V32HI "TARGET_AVX512BW || TARGET_AVX512VNNI") (V16HI "TARGET_AVX2") V8HI]) @@ -3335,10 +3338,10 @@ [(set (match_operand:VFH 0 "register_operand") (vec_merge:VFH (match_operand:VFH 1 "nonimmediate_operand") - (match_operand:VFH 2 "nonimmediate_operand") + (match_operand:VFH 2 "general_operand") (unspec:<avx512fmaskmode> [(match_operand:VFH 3 "nonimmediate_operand") - (match_operand:VFH 4 "nonimmediate_operand") + (match_operand:VFH 4 "general_operand") (match_operand:SI 5 "const_0_to_31_operand")] UNSPEC_PCMP)))] "TARGET_SSE && ix86_pre_reload_split () @@ -3349,19 +3352,21 @@ && (INTVAL (operands[5]) == 1 || INTVAL (operands[5]) == 14)" "#" "&& 1" - [(const_int 0)] + [(set (match_dup 0) (match_dup 6))] { int u = UNSPEC_IEEE_MIN; + rtx tmp = operands[2]; if ((INTVAL (operands[5]) == 1 && rtx_equal_p (operands[1], operands[4])) || (INTVAL (operands[5]) == 14 && rtx_equal_p (operands[1], operands[3]))) u = UNSPEC_IEEE_MAX; if (MEM_P (operands[1])) operands[1] = force_reg (<MODE>mode, operands[1]); - rtvec v = gen_rtvec (2, operands[1], operands[2]); - rtx tmp = gen_rtx_UNSPEC (<MODE>mode, v, u); - emit_move_insn (operands[0], tmp); - DONE; + + if (immediate_operand (operands[2], <MODE>mode)) + tmp = force_reg (<MODE>mode, operands[2]); + rtvec v = gen_rtvec (2, operands[1], tmp); + operands[6] = gen_rtx_UNSPEC (<MODE>mode, v, u); }) (define_insn_and_split "*minmax<mode>3_2" @@ -3380,7 +3385,7 @@ && rtx_equal_p (operands[2], operands[3])))" "#" "&& 1" - [(const_int 0)] + [(set (match_dup 0) (match_dup 5))] { int u = UNSPEC_IEEE_MIN; if (rtx_equal_p (operands[1], operands[3])) @@ -3389,9 +3394,53 @@ if (MEM_P (operands[2])) operands[2] = force_reg (<MODE>mode, operands[2]); rtvec v = gen_rtvec (2, operands[2], operands[1]); - rtx tmp = gen_rtx_UNSPEC (<MODE>mode, v, u); - emit_move_insn (operands[0], tmp); - DONE; + operands[5] = gen_rtx_UNSPEC (<MODE>mode, v, u); + }) + + +(define_insn_and_split "*minmax<mode>3_3" + [(set (match_operand:VF_128_256 0 "register_operand") + (and:VF_128_256 + (not:VF_128_256 + (match_operator:VF_128_256 1 "ieee_maxmin_comparison_operator" + [(match_operand:VF_128_256 2 "nonimmediate_operand") + (match_operand:VF_128_256 3 "const0_operand")])) + (match_operand:VF_128_256 4 "nonimmediate_operand")))] + "TARGET_SSE && ix86_pre_reload_split () + && rtx_equal_p (operands[2], operands[4])" + "#" + "&& 1" + [(set (match_dup 0) (match_dup 5))] + { + int u = UNSPEC_IEEE_MIN; + if (GET_CODE (operands[1]) == LT) + u = UNSPEC_IEEE_MAX; + + rtx tmp = force_reg (<MODE>mode, operands[3]); + rtvec v = gen_rtvec (2, tmp, operands[2]); + operands[5] = gen_rtx_UNSPEC (<MODE>mode, v, u); + }) + +(define_insn_and_split "*minmax<mode>3_4" + [(set (match_operand:VF_128_256 0 "register_operand") + (and:VF_128_256 + (match_operator:VF_128_256 1 "ieee_maxmin_comparison_operator" + [(match_operand:VF_128_256 2 "nonimmediate_operand") + (match_operand:VF_128_256 3 "const0_operand")]) + (match_operand:VF_128_256 4 "nonimmediate_operand")))] + "TARGET_SSE && ix86_pre_reload_split () + && rtx_equal_p (operands[2], operands[4])" + "#" + "&& 1" + [(set (match_dup 0) (match_dup 5))] + { + int u = UNSPEC_IEEE_MIN; + if (GET_CODE (operands[1]) == GT) + u = UNSPEC_IEEE_MAX; + + rtx tmp = force_reg (<MODE>mode, operands[3]); + rtvec v = gen_rtvec (2, operands[2], tmp); + operands[5] = gen_rtx_UNSPEC (<MODE>mode, v, u); }) ;; These versions of the min/max patterns implement exactly the operations @@ -4650,6 +4699,9 @@ UNSPEC_PCMP))] "operands[4] = GEN_INT (INTVAL (operands[3]) ^ 4);") +(define_int_iterator UNSPEC_PCMP_ITER + [UNSPEC_PCMP UNSPEC_UNSIGNED_PCMP]) + (define_insn "*<avx512>_cmp<mode>3_and15" [(set (match_operand:QI 0 "register_operand" "=k") (and:QI @@ -4682,6 +4734,23 @@ (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) +(define_insn "*<avx512>_eq<mode>3_and15" + [(set (match_operand:QI 0 "register_operand" "=k, k") + (and:QI + (unspec:QI + [(match_operand:VI48_AVX512VL_4 1 "nonimm_or_0_operand" "%v, v") + (match_operand:VI48_AVX512VL_4 2 "nonimm_or_0_operand" "vm, C") + (const_int 0)] + UNSPEC_PCMP_ITER) + (const_int 15)))] + "TARGET_AVX512F && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "@ + vpcmpeq<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2} + vptestnm<ssemodesuffix>\t{%1, %1, %0|%0, %1, %1}" + [(set_attr "type" "ssecmp") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + (define_insn "*<avx512>_cmp<mode>3_and3" [(set (match_operand:QI 0 "register_operand" "=k") (and:QI @@ -4714,6 +4783,23 @@ (set_attr "prefix" "evex") (set_attr "mode" "TI")]) +(define_insn "*avx512vl_eqv2di_and3" + [(set (match_operand:QI 0 "register_operand" "=k, k") + (and:QI + (unspec:QI + [(match_operand:V2DI 1 "nonimm_or_0_operand" "%v, v") + (match_operand:V2DI 2 "nonimm_or_0_operand" "vm, C") + (const_int 0)] + UNSPEC_PCMP_ITER) + (const_int 3)))] + "TARGET_AVX512F && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "@ + vpcmpeqq\t{%2, %1, %0|%0, %1, %2} + vptestnmq\t{%1, %1, %0|%0, %1, %1}" + [(set_attr "type" "ssecmp") + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + (define_insn "<avx512>_cmp<mode>3<mask_scalar_merge_name>" [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") (unspec:<avx512fmaskmode> @@ -4787,9 +4873,6 @@ (set_attr "prefix" "evex") (set_attr "mode" "<VI12_AVX512VL:sseinsnmode>")]) -(define_int_iterator UNSPEC_PCMP_ITER - [UNSPEC_PCMP UNSPEC_UNSIGNED_PCMP]) - (define_insn_and_split "*<avx512>_cmp<mode>3" [(set (match_operand:<avx512fmaskmode> 0 "register_operand") (not:<avx512fmaskmode> @@ -4817,7 +4900,8 @@ (match_operand:SI 3 "<cmp_imm_predicate>")] UNSPEC_PCMP_ITER))] "TARGET_AVX512F && ix86_pre_reload_split () - && rtx_equal_p (operands[1], operands[2])" + && rtx_equal_p (operands[1], operands[2]) + && (!MEM_P (operands[1]) || !MEM_VOLATILE_P (operands[1]))" "#" "&& 1" [(set (match_dup 0) (match_dup 4))] @@ -27256,24 +27340,6 @@ DONE; }) -(define_expand "cond_<insn><mode>" - [(set (match_operand:VI1_AVX512VL 0 "register_operand") - (vec_merge:VI1_AVX512VL - (any_shift:VI1_AVX512VL - (match_operand:VI1_AVX512VL 2 "register_operand") - (match_operand:VI1_AVX512VL 3 "const_vec_dup_operand")) - (match_operand:VI1_AVX512VL 4 "nonimm_or_0_operand") - (match_operand:<avx512fmaskmode> 1 "register_operand")))] - "TARGET_GFNI && TARGET_AVX512F" -{ - rtx count = XVECEXP (operands[3], 0, 0); - rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], count, <CODE>); - emit_insn (gen_vgf2p8affineqb_<mode>_mask (operands[0], operands[2], matrix, - const0_rtx, operands[4], - operands[1])); - DONE; -}) - (define_expand "<insn><mode>3" [(set (match_operand:VI1_AVX512_3264 0 "register_operand") (any_rotate:VI1_AVX512_3264 @@ -32334,8 +32400,8 @@ (define_expand "usdot_prod<sseunpackmodelower><mode>" [(match_operand:<sseunpackmode> 0 "register_operand") - (match_operand:VI2_AVX512F 1 "register_operand") - (match_operand:VI2_AVX512F 2 "register_operand") + (match_operand:VI2_AVX10_2 1 "register_operand") + (match_operand:VI2_AVX10_2 2 "register_operand") (match_operand:<sseunpackmode> 3 "register_operand")] "TARGET_AVXVNNIINT16 || TARGET_AVX10_2" { @@ -32352,8 +32418,8 @@ (define_expand "udot_prod<sseunpackmodelower><mode>" [(match_operand:<sseunpackmode> 0 "register_operand") - (match_operand:VI2_AVX512F 1 "register_operand") - (match_operand:VI2_AVX512F 2 "register_operand") + (match_operand:VI2_AVX10_2 1 "register_operand") + (match_operand:VI2_AVX10_2 2 "register_operand") (match_operand:<sseunpackmode> 3 "register_operand")] "TARGET_AVXVNNIINT16 || TARGET_AVX10_2" { diff --git a/gcc/config/i386/x-mingw32 b/gcc/config/i386/x-mingw32 index 8900bfc..5ebe088 100644 --- a/gcc/config/i386/x-mingw32 +++ b/gcc/config/i386/x-mingw32 @@ -21,6 +21,9 @@ # local_includedir=$(libsubdir)/$(unlibsubdir)/..`echo $(exec_prefix) | sed -e 's|^$(prefix)||' -e 's|/[^/]*|/..|g'`/include +# Add Windows socket library. +LIBS += -lws2_32 + # On MinGW, we use "%IA64d" to print 64-bit integers, and the format-checking # code does not handle that, so we have to disable checking here. WERROR_FLAGS += -Wno-format diff --git a/gcc/config/i386/x86-tune-sched.cc b/gcc/config/i386/x86-tune-sched.cc index ff9c268..11b3338 100644 --- a/gcc/config/i386/x86-tune-sched.cc +++ b/gcc/config/i386/x86-tune-sched.cc @@ -110,6 +110,9 @@ ix86_issue_rate (void) case PROCESSOR_PANTHERLAKE: return 6; + case PROCESSOR_NOVALAKE: + return 8; + default: return 1; } diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index 3627312..dcd26d5 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -602,7 +602,7 @@ DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER2 /* X86_TUNE_AVX256_OPTIMAL: Use 256-bit AVX instructions instead of 512-bit AVX instructions in the auto-vectorizer. */ -DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_CORE_AVX512) +DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_CORE_AVX512 | m_NOVALAKE) /* X86_TUNE_AVX256_AVOID_VEC_PERM: Avoid using 256-bit cross-lane vector permutation instructions in the auto-vectorizer. */ |
