Diffstat (limited to 'gcc/config/i386')
-rw-r--r--  gcc/config/i386/amxavx512intrin.h  |  57
-rw-r--r--  gcc/config/i386/amxbf16intrin.h    |   6
-rw-r--r--  gcc/config/i386/amxcomplexintrin.h |  18
-rw-r--r--  gcc/config/i386/amxfp16intrin.h    |   8
-rw-r--r--  gcc/config/i386/amxfp8intrin.h     |  38
-rw-r--r--  gcc/config/i386/amxint8intrin.h    |   4
-rw-r--r--  gcc/config/i386/amxmovrsintrin.h   |  20
-rw-r--r--  gcc/config/i386/amxtf32intrin.h    |   6
-rw-r--r--  gcc/config/i386/amxtileintrin.h    |  20
-rw-r--r--  gcc/config/i386/driver-i386.cc     |   6
-rw-r--r--  gcc/config/i386/i386-expand.cc     | 771
-rw-r--r--  gcc/config/i386/i386-features.cc   |  11
-rw-r--r--  gcc/config/i386/i386-options.cc    |  17
-rw-r--r--  gcc/config/i386/i386-protos.h      |   1
-rw-r--r--  gcc/config/i386/i386.cc            |  48
-rw-r--r--  gcc/config/i386/i386.h             |   6
-rw-r--r--  gcc/config/i386/i386.md            | 268
-rw-r--r--  gcc/config/i386/i386.opt           |   4
-rw-r--r--  gcc/config/i386/i386.opt.urls      |  10
-rw-r--r--  gcc/config/i386/predicates.md      |   6
-rw-r--r--  gcc/config/i386/sol2.h             |  23
-rw-r--r--  gcc/config/i386/sse.md             | 140
-rw-r--r--  gcc/config/i386/x-mingw32          |   3
-rw-r--r--  gcc/config/i386/x86-tune-sched.cc  |   3
-rw-r--r--  gcc/config/i386/x86-tune.def       |   2
25 files changed, 1325 insertions(+), 171 deletions(-)
diff --git a/gcc/config/i386/amxavx512intrin.h b/gcc/config/i386/amxavx512intrin.h
index ab53625..1e28460 100644
--- a/gcc/config/i386/amxavx512intrin.h
+++ b/gcc/config/i386/amxavx512intrin.h
@@ -39,8 +39,9 @@
({ \
__m512 dst; \
__asm__ volatile \
- ("{tcvtrowd2ps\t%1, %%tmm"#src", %0|tcvtrowd2ps\t%0, %%tmm"#src", %1}" \
- : "=v" (dst) : "r" ((unsigned) (A))); \
+ ("{tcvtrowd2ps\t%1, %%tmm%c[_src], %0 \
+ |tcvtrowd2ps\t%0, tmm%c[_src], %1}" \
+ : "=v" (dst) : "r" ((unsigned) (A)), [_src]"i"(src)); \
dst; \
})
@@ -48,8 +49,9 @@
({ \
__m512 dst; \
__asm__ volatile \
- ("{tcvtrowd2ps\t$"#imm", %%tmm"#src", %0|tcvtrowd2ps\t%0, %%tmm"#src", "#imm"}" \
- : "=v" (dst) :); \
+ ("{tcvtrowd2ps\t%[_imm], %%tmm%c[_src], %0 \
+ |tcvtrowd2ps\t%0, tmm%c[_src], %[_imm]}" \
+ : "=v" (dst) : [_src]"i"(src), [_imm]"i"(imm)); \
dst; \
})
@@ -57,8 +59,9 @@
({ \
__m512bh dst; \
__asm__ volatile \
- ("{tcvtrowps2bf16h\t%1, %%tmm"#src", %0|tcvtrowps2bf16h\t%0, %%tmm"#src", %1}" \
- : "=v" (dst) : "r" ((unsigned) (A))); \
+ ("{tcvtrowps2bf16h\t%1, %%tmm%c[_src], %0 \
+ |tcvtrowps2bf16h\t%0, tmm%c[_src], %1}" \
+ : "=v" (dst) : "r" ((unsigned) (A)), [_src]"i"(src)); \
dst; \
})
@@ -66,8 +69,9 @@
({ \
__m512bh dst; \
__asm__ volatile \
- ("{tcvtrowps2bf16h\t$"#imm", %%tmm"#src", %0|tcvtrowps2bf16h\t%0, %%tmm"#src", "#imm"}" \
- : "=v" (dst) :); \
+ ("{tcvtrowps2bf16h\t%[_imm], %%tmm%c[_src], %0 \
+ |tcvtrowps2bf16h\t%0, tmm%c[_src], %[_imm]}" \
+ : "=v" (dst) : [_src]"i"(src), [_imm]"i"(imm)); \
dst; \
})
@@ -75,8 +79,9 @@
({ \
__m512bh dst; \
__asm__ volatile \
- ("{tcvtrowps2bf16l\t%1, %%tmm"#src", %0|tcvtrowps2bf16l\t%0, %%tmm"#src", %1}" \
- : "=v" (dst) : "r" ((unsigned) (A))); \
+ ("{tcvtrowps2bf16l\t%1, %%tmm%c[_src], %0 \
+ |tcvtrowps2bf16l\t%0, tmm%c[_src], %1}" \
+ : "=v" (dst) : "r" ((unsigned) (A)), [_src]"i"(src)); \
dst; \
})
@@ -84,8 +89,9 @@
({ \
__m512bh dst; \
__asm__ volatile \
- ("{tcvtrowps2bf16l\t$"#imm", %%tmm"#src", %0|tcvtrowps2bf16l\t%0, %%tmm"#src", "#imm"}" \
- : "=v" (dst) :); \
+ ("{tcvtrowps2bf16l\t%[_imm], %%tmm%c[_src], %0 \
+      |tcvtrowps2bf16l\t%0, tmm%c[_src], %[_imm]}" \
+ : "=v" (dst) : [_src]"i"(src), [_imm]"i"(imm)); \
dst; \
})
@@ -93,8 +99,8 @@
({ \
__m512h dst; \
__asm__ volatile \
- ("{tcvtrowps2phh\t%1, %%tmm"#src", %0|tcvtrowps2phh\t%0, %%tmm"#src", %1}" \
- : "=v" (dst) : "r" ((unsigned) (A))); \
+ ("{tcvtrowps2phh\t%1, %%tmm%c[_src], %0|tcvtrowps2phh\t%0, tmm%c[_src], %1}" \
+ : "=v" (dst) : "r" ((unsigned) (A)), [_src]"i"(src)); \
dst; \
})
@@ -102,8 +108,9 @@
({ \
__m512h dst; \
__asm__ volatile \
- ("{tcvtrowps2phh\t$"#imm", %%tmm"#src", %0|tcvtrowps2phh\t%0, %%tmm"#src", "#imm"}" \
- : "=v" (dst) :); \
+ ("{tcvtrowps2phh\t%[_imm], %%tmm%c[_src], %0 \
+      |tcvtrowps2phh\t%0, tmm%c[_src], %[_imm]}" \
+ : "=v" (dst) : [_src]"i"(src), [_imm]"i"(imm)); \
dst; \
})
@@ -111,8 +118,8 @@
({ \
__m512h dst; \
__asm__ volatile \
- ("{tcvtrowps2phl\t%1, %%tmm"#src", %0|tcvtrowps2phl\t%0, %%tmm"#src", %1}" \
- : "=v" (dst) : "r" ((unsigned) (A))); \
+ ("{tcvtrowps2phl\t%1, %%tmm%c[_src], %0|tcvtrowps2phl\t%0, tmm%c[_src], %1}" \
+ : "=v" (dst) : "r" ((unsigned) (A)), [_src]"i"(src)); \
dst; \
})
@@ -120,8 +127,9 @@
({ \
__m512h dst; \
__asm__ volatile \
- ("{tcvtrowps2phl\t$"#imm", %%tmm"#src", %0|tcvtrowps2phl\t%0, %%tmm"#src", "#imm"}" \
- : "=v" (dst) :); \
+ ("{tcvtrowps2phl\t%[_imm], %%tmm%c[_src], %0 \
+      |tcvtrowps2phl\t%0, tmm%c[_src], %[_imm]}" \
+ : "=v" (dst) : [_src]"i"(src), [_imm]"i"(imm)); \
dst; \
})
@@ -129,8 +137,8 @@
({ \
__m512 dst; \
__asm__ volatile \
- ("{tilemovrow\t%1, %%tmm"#src", %0|tilemovrow\t%0, %%tmm"#src", %1}" \
- : "=v" (dst) : "r" ((unsigned) (A))); \
+ ("{tilemovrow\t%1, %%tmm%c[_src], %0|tilemovrow\t%0, tmm%c[_src], %1}" \
+ : "=v" (dst) : "r" ((unsigned) (A)), [_src]"i"(src)); \
dst; \
})
@@ -138,8 +146,9 @@
({ \
__m512 dst; \
__asm__ volatile \
- ("{tilemovrow\t$"#imm", %%tmm"#src", %0|tilemovrow\t%0, %%tmm"#src", "#imm"}" \
- : "=v" (dst) :); \
+ ("{tilemovrow\t%[_imm], %%tmm%c[_src], %0 \
+      |tilemovrow\t%0, tmm%c[_src], %[_imm]}" \
+ : "=v" (dst) : [_src]"i"(src), [_imm]"i"(imm)); \
dst; \
})
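These templates rely on GCC's "%c" operand modifier: with an "i" constraint, %c prints the constant bare, with no '$' (AT&T) punctuation, so it can be spliced into a register name. A minimal standalone sketch of the mechanism (not part of the patch):

  /* "%%tmm" prints "%tmm"; "%c[idx]" then appends the bare constant,
     so the template below expands to "# operand is %tmm2".  */
  static inline void
  reference_tmm2 (void)
  {
    __asm__ volatile ("# operand is %%tmm%c[idx]" :: [idx] "i" (2));
  }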
diff --git a/gcc/config/i386/amxbf16intrin.h b/gcc/config/i386/amxbf16intrin.h
index 9f4a9d1..b2792bb 100644
--- a/gcc/config/i386/amxbf16intrin.h
+++ b/gcc/config/i386/amxbf16intrin.h
@@ -36,8 +36,10 @@
#if defined(__x86_64__)
#define _tile_dpbf16ps_internal(dst,src1,src2) \
- __asm__ volatile\
- ("{tdpbf16ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdpbf16ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::)
+ __asm__ volatile \
+ ("{tdpbf16ps\t%%tmm%c[_src2], %%tmm%c[_src1], %%tmm%c[_dst] \
+ |tdpbf16ps\ttmm%c[_dst], tmm%c[_src1], tmm%c[_src2]}" \
+ :: [_dst]"i"(dst), [_src1]"i"(src1), [_src2]"i"(src2))
#define _tile_dpbf16ps(dst,src1,src2) \
_tile_dpbf16ps_internal (dst, src1, src2)
diff --git a/gcc/config/i386/amxcomplexintrin.h b/gcc/config/i386/amxcomplexintrin.h
index fc5964f..55b7d53 100644
--- a/gcc/config/i386/amxcomplexintrin.h
+++ b/gcc/config/i386/amxcomplexintrin.h
@@ -35,13 +35,17 @@
#endif /* __AMX_COMPLEX__ */
#if defined(__x86_64__)
-#define _tile_cmmimfp16ps_internal(src1_dst,src2,src3) \
- __asm__ volatile\
- ("{tcmmimfp16ps\t%%tmm"#src3", %%tmm"#src2", %%tmm"#src1_dst"|tcmmimfp16ps\t%%tmm"#src1_dst", %%tmm"#src2", %%tmm"#src3"}" ::)
-
-#define _tile_cmmrlfp16ps_internal(src1_dst,src2,src3) \
- __asm__ volatile\
- ("{tcmmrlfp16ps\t%%tmm"#src3", %%tmm"#src2", %%tmm"#src1_dst"|tcmmrlfp16ps\t%%tmm"#src1_dst", %%tmm"#src2", %%tmm"#src3"}" ::)
+#define _tile_cmmimfp16ps_internal(src1_dst,src2,src3) \
+ __asm__ volatile \
+ ("{tcmmimfp16ps\t%%tmm%c[_src3], %%tmm%c[_src2], %%tmm%c[_src1_dst] \
+ |tcmmimfp16ps\ttmm%c[_src1_dst], tmm%c[_src2], tmm%c[_src3]}" \
+ :: [_src1_dst]"i"(src1_dst), [_src2]"i"(src2), [_src3]"i"(src3))
+
+#define _tile_cmmrlfp16ps_internal(src1_dst,src2,src3) \
+ __asm__ volatile \
+ ("{tcmmrlfp16ps\t%%tmm%c[_src3], %%tmm%c[_src2], %%tmm%c[_src1_dst] \
+ |tcmmrlfp16ps\ttmm%c[_src1_dst], tmm%c[_src2], tmm%c[_src3]}" \
+ :: [_src1_dst]"i"(src1_dst), [_src2]"i"(src2), [_src3]"i"(src3))
#define _tile_cmmimfp16ps(src1_dst,src2,src3) \
_tile_cmmimfp16ps_internal (src1_dst, src2, src3)
diff --git a/gcc/config/i386/amxfp16intrin.h b/gcc/config/i386/amxfp16intrin.h
index 02fd031..1e0ef27 100644
--- a/gcc/config/i386/amxfp16intrin.h
+++ b/gcc/config/i386/amxfp16intrin.h
@@ -29,9 +29,11 @@
#define _AMXFP16INTRIN_H_INCLUDED
#if defined(__x86_64__)
-#define _tile_dpfp16ps_internal(dst,src1,src2) \
- __asm__ volatile \
- ("{tdpfp16ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdpfp16ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::)
+#define _tile_dpfp16ps_internal(dst,src1,src2) \
+ __asm__ volatile \
+ ("{tdpfp16ps\t%%tmm%c[_src2], %%tmm%c[_src1], %%tmm%c[_dst] \
+ |tdpfp16ps\ttmm%c[_dst], tmm%c[_src1], tmm%c[_src2]}" \
+ :: [_dst]"i"(dst), [_src1]"i"(src1), [_src2]"i"(src2))
#define _tile_dpfp16ps(dst,src1,src2) \
_tile_dpfp16ps_internal (dst,src1,src2)
diff --git a/gcc/config/i386/amxfp8intrin.h b/gcc/config/i386/amxfp8intrin.h
index 8952be9..9467f53 100644
--- a/gcc/config/i386/amxfp8intrin.h
+++ b/gcc/config/i386/amxfp8intrin.h
@@ -29,21 +29,29 @@
#define _AMXFP8INTRIN_H_INCLUDED
#if defined(__x86_64__)
-#define _tile_dpbf8ps_internal(dst,src1,src2) \
- __asm__ volatile \
- ("{tdpbf8ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdpbf8ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::)
-
-#define _tile_dpbhf8ps_internal(dst,src1,src2) \
- __asm__ volatile \
- ("{tdpbhf8ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdpbhf8ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::)
-
-#define _tile_dphbf8ps_internal(dst,src1,src2) \
- __asm__ volatile \
- ("{tdphbf8ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdphbf8ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::)
-
-#define _tile_dphf8ps_internal(dst,src1,src2) \
- __asm__ volatile \
- ("{tdphf8ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdphf8ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::)
+#define _tile_dpbf8ps_internal(dst,src1,src2) \
+ __asm__ volatile \
+ ("{tdpbf8ps\t%%tmm%c[_src2], %%tmm%c[_src1], %%tmm%c[_dst] \
+ |tdpbf8ps\ttmm%c[_dst], tmm%c[_src1], tmm%c[_src2]}" \
+ :: [_dst]"i"(dst), [_src1]"i"(src1), [_src2]"i"(src2))
+
+#define _tile_dpbhf8ps_internal(dst,src1,src2) \
+ __asm__ volatile \
+ ("{tdpbhf8ps\t%%tmm%c[_src2], %%tmm%c[_src1], %%tmm%c[_dst] \
+ |tdpbhf8ps\ttmm%c[_dst], tmm%c[_src1], tmm%c[_src2]}" \
+ :: [_dst]"i"(dst), [_src1]"i"(src1), [_src2]"i"(src2))
+
+#define _tile_dphbf8ps_internal(dst,src1,src2) \
+ __asm__ volatile \
+ ("{tdphbf8ps\t%%tmm%c[_src2], %%tmm%c[_src1], %%tmm%c[_dst] \
+ |tdphbf8ps\ttmm%c[_dst], tmm%c[_src1], tmm%c[_src2]}" \
+ :: [_dst]"i"(dst), [_src1]"i"(src1), [_src2]"i"(src2))
+
+#define _tile_dphf8ps_internal(dst,src1,src2) \
+ __asm__ volatile \
+ ("{tdphf8ps\t%%tmm%c[_src2], %%tmm%c[_src1], %%tmm%c[_dst] \
+ |tdphf8ps\ttmm%c[_dst], tmm%c[_src1], tmm%c[_src2]}" \
+ :: [_dst]"i"(dst), [_src1]"i"(src1), [_src2]"i"(src2))
#define _tile_dpbf8ps(dst,src1,src2) \
_tile_dpbf8ps_internal (dst,src1,src2)
diff --git a/gcc/config/i386/amxint8intrin.h b/gcc/config/i386/amxint8intrin.h
index 332c8db..f7cb36c 100644
--- a/gcc/config/i386/amxint8intrin.h
+++ b/gcc/config/i386/amxint8intrin.h
@@ -37,7 +37,9 @@
#if defined(__x86_64__)
#define _tile_int8_dp_internal(name,dst,src1,src2) \
__asm__ volatile \
- ("{"#name"\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|"#name"\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::)
+ ("{"#name"\t%%tmm%c[_src2], %%tmm%c[_src1], %%tmm%c[_dst] \
+ |"#name"\ttmm%c[_dst], tmm%c[_src1], tmm%c[_src2]}" \
+ ::[_dst]"i"(dst),[_src1]"i"(src1),[_src2]"i"(src2))
#define _tile_dpbssd(dst,src1,src2) \
_tile_int8_dp_internal (tdpbssd, dst, src1, src2)
diff --git a/gcc/config/i386/amxmovrsintrin.h b/gcc/config/i386/amxmovrsintrin.h
index 93a2dbf..9f5d317 100644
--- a/gcc/config/i386/amxmovrsintrin.h
+++ b/gcc/config/i386/amxmovrsintrin.h
@@ -36,17 +36,17 @@
#define __DISABLE_AMX_MOVRS__
#endif /* __AMX_MOVRS__ */
-#define _tile_loaddrs_internal(tdst, base, stride) \
-__asm__ volatile \
- ("{tileloaddrs\t(%0,%1,1), %%tmm"#tdst \
- "|tileloaddrs\t%%tmm"#tdst", [%0+%1*1]}" \
- :: "r" ((const void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)))
+#define _tile_loaddrs_internal(tdst, base, stride) \
+__asm__ volatile \
+ ("{tileloaddrs\t(%0,%1,1), %%tmm%c[_tdst] \
+ |tileloaddrs\ttmm%c[_tdst], [%0+%1*1]}" \
+ :: "r" ((const void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)), [_tdst]"i"(tdst))
-#define _tile_loaddrst1_internal(tdst, base, stride) \
-__asm__ volatile \
- ("{tileloaddrst1\t(%0,%1,1), %%tmm"#tdst \
- "|tileloaddrst1\t%%tmm"#tdst", [%0+%1*1]}" \
- :: "r" ((const void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)))
+#define _tile_loaddrst1_internal(tdst, base, stride) \
+__asm__ volatile \
+ ("{tileloaddrst1\t(%0,%1,1), %%tmm%c[_tdst] \
+ |tileloaddrst1\ttmm%c[_tdst], [%0+%1*1]}" \
+ :: "r" ((const void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)), [_tdst]"i"(tdst))
#define _tile_loaddrs(tdst, base, stride) \
_tile_loaddrs_internal(tdst, base, stride)
diff --git a/gcc/config/i386/amxtf32intrin.h b/gcc/config/i386/amxtf32intrin.h
index 8ed910d..a7a1f4f 100644
--- a/gcc/config/i386/amxtf32intrin.h
+++ b/gcc/config/i386/amxtf32intrin.h
@@ -31,8 +31,10 @@
#if defined(__x86_64__)
#define _tile_mmultf32ps_internal(src1_dst,src2,src3) \
- __asm__ volatile\
- ("{tmmultf32ps\t%%tmm"#src3", %%tmm"#src2", %%tmm"#src1_dst"|tmmultf32ps\t%%tmm"#src1_dst", %%tmm"#src2", %%tmm"#src3"}" ::)
+ __asm__ volatile \
+ ("{tmmultf32ps\t%%tmm%c[_src3], %%tmm%c[_src2], %%tmm%c[_src1_dst] \
+ |tmmultf32ps\ttmm%c[_src1_dst], tmm%c[_src2], tmm%c[_src3]}" \
+ :: [_src1_dst]"i"(src1_dst), [_src2]"i"(src2), [_src3]"i"(src3))
#define _tile_mmultf32ps(src1_dst,src2,src3) \
_tile_mmultf32ps_internal (src1_dst, src2, src3)
diff --git a/gcc/config/i386/amxtileintrin.h b/gcc/config/i386/amxtileintrin.h
index 8c8e2cd..67c6b53 100644
--- a/gcc/config/i386/amxtileintrin.h
+++ b/gcc/config/i386/amxtileintrin.h
@@ -61,32 +61,32 @@ _tile_release (void)
#define _tile_loadd_internal(dst,base,stride) \
__asm__ volatile \
- ("{tileloadd\t(%0,%1,1), %%tmm"#dst"|tileloadd\t%%tmm"#dst", [%0+%1*1]}" \
- :: "r" ((const void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)))
+ ("{tileloadd\t(%0,%1,1), %%tmm%c[_dst]|tileloadd\ttmm%c[_dst], [%0+%1*1]}" \
+ :: "r" ((const void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)), [_dst]"i"(dst))
#define _tile_stream_loadd(dst,base,stride) \
_tile_stream_loadd_internal (dst, base, stride)
#define _tile_stream_loadd_internal(dst,base,stride) \
__asm__ volatile \
- ("{tileloaddt1\t(%0,%1,1), %%tmm"#dst"|tileloaddt1\t%%tmm"#dst", [%0+%1*1]}" \
- :: "r" ((const void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)))
+ ("{tileloaddt1\t(%0,%1,1), %%tmm%c[_dst]|tileloaddt1\ttmm%c[_dst], [%0+%1*1]}" \
+ :: "r" ((const void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)), [_dst]"i"(dst))
#define _tile_stored(dst,base,stride) \
_tile_stored_internal (dst, base, stride)
#define _tile_stored_internal(src,base,stride) \
__asm__ volatile \
- ("{tilestored\t%%tmm"#src", (%0,%1,1)|tilestored\t[%0+%1*1], %%tmm"#src"}" \
- :: "r" ((void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)) \
- : "memory")
+ ("{tilestored\t%%tmm%c[_src], (%0,%1,1)|tilestored\t[%0+%1*1], tmm%c[_src]}" \
+ :: "r" ((void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)), [_src]"i"(src) \
+ : "memory")
#define _tile_zero(dst) \
_tile_zero_internal (dst)
-#define _tile_zero_internal(dst) \
- __asm__ volatile \
- ("tilezero\t%%tmm"#dst ::)
+#define _tile_zero_internal(dst) \
+ __asm__ volatile \
+ ("{tilezero\t%%tmm%c[_dst]|tilezero\ttmm%c[_dst]}" :: [_dst]"i"(dst))
#endif
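For context, a usage sketch of the rewritten tile macros (assumptions of this example: x86-64 with -mamx-tile -mamx-int8, tile palette already configured via _tile_loadconfig). The literal tile numbers become the [_dst]/[_src] immediates above:

  #include <immintrin.h>

  void
  dot_accumulate (void *c, const void *a, const void *b, long stride)
  {
    _tile_zero (0);               /* tmm0 = 0 */
    _tile_loadd (1, a, stride);   /* tmm1 = rows of A */
    _tile_loadd (2, b, stride);   /* tmm2 = rows of B */
    _tile_dpbssd (0, 1, 2);       /* tmm0 += signed int8 dot products */
    _tile_stored (0, c, stride);  /* write the accumulator back */
  }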
diff --git a/gcc/config/i386/driver-i386.cc b/gcc/config/i386/driver-i386.cc
index 0557df9..b54f0af 100644
--- a/gcc/config/i386/driver-i386.cc
+++ b/gcc/config/i386/driver-i386.cc
@@ -603,6 +603,9 @@ const char *host_detect_local_cpu (int argc, const char **argv)
/* Assume Diamond Rapids. */
if (has_feature (FEATURE_AMX_FP8))
cpu = "diamondrapids";
+ /* Assume Nova Lake. */
+ else if (has_feature (FEATURE_AVX10_2))
+ cpu = "novalake";
/* Assume Granite Rapids D. */
else if (has_feature (FEATURE_AMX_COMPLEX))
cpu = "graniterapids-d";
@@ -643,9 +646,6 @@ const char *host_detect_local_cpu (int argc, const char **argv)
/* Assume Clearwater Forest. */
if (has_feature (FEATURE_USER_MSR))
cpu = "clearwaterforest";
- /* Assume Nova Lake. */
- else if (has_feature (FEATURE_PREFETCHI))
- cpu = "novalake";
else if (has_feature (FEATURE_SM3))
{
if (has_feature (FEATURE_KL))
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index a1f1b26..438fa4e 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -4159,12 +4159,18 @@ static bool
ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
rtx cmp_op1, rtx if_true, rtx if_false)
{
- machine_mode mode;
+ machine_mode mode = GET_MODE (dest);
bool is_min;
rtx tmp;
if (code == LT)
;
+ else if (code == LE && !HONOR_NANS (mode))
+ {
+ /* We can swap LE to GE and then invert to LT. */
+ std::swap (cmp_op0, cmp_op1);
+ std::swap (if_true, if_false);
+ }
else if (code == UNGE)
std::swap (if_true, if_false);
else
@@ -4177,7 +4183,6 @@ ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
else
return false;
- mode = GET_MODE (dest);
if (immediate_operand (if_false, mode))
if_false = force_reg (mode, if_false);
if (immediate_operand (if_true, mode))
@@ -9995,6 +10000,754 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
return true;
}
+/* Fully unroll memmove of known size with up to 8 registers. */
+
+static bool
+ix86_expand_unroll_movmem (rtx dst, rtx src, rtx destreg, rtx srcreg,
+ unsigned HOST_WIDE_INT count,
+ machine_mode mode)
+{
+  /* If 8 registers can cover all memory, load them into
+ registers and store them together to avoid possible address
+ overlap between source and destination. */
+ unsigned HOST_WIDE_INT moves = count / GET_MODE_SIZE (mode);
+ if (moves == 0)
+ {
+ mode = smallest_int_mode_for_size
+ (count * BITS_PER_UNIT).require ();
+ if (count == GET_MODE_SIZE (mode))
+ moves = 1;
+ else
+ {
+ /* Reduce the smallest move size by half so that MOVES == 1. */
+ mode = smallest_int_mode_for_size
+ (GET_MODE_BITSIZE (mode) / 2).require ();
+ moves = count / GET_MODE_SIZE (mode);
+ gcc_assert (moves == 1);
+ }
+ }
+ else if (moves > 8)
+ return false;
+
+ unsigned int i;
+ rtx tmp[9];
+
+ for (i = 0; i < moves; i++)
+ tmp[i] = gen_reg_rtx (mode);
+
+ rtx srcmem = change_address (src, mode, srcreg);
+ for (i = 0; i < moves; i++)
+ {
+ emit_move_insn (tmp[i], srcmem);
+ srcmem = offset_address (srcmem,
+ GEN_INT (GET_MODE_SIZE (mode)),
+ GET_MODE_SIZE (mode));
+ }
+
+ unsigned int epilogue_size = count & (GET_MODE_SIZE (mode) - 1);
+ machine_mode epilogue_mode = VOIDmode;
+ if (epilogue_size)
+ {
+ /* Handle the remaining bytes with overlapping move. */
+ epilogue_mode = smallest_int_mode_for_size
+ (epilogue_size * BITS_PER_UNIT).require ();
+ tmp[8] = gen_reg_rtx (epilogue_mode);
+ srcmem = adjust_address (srcmem, epilogue_mode, 0);
+ srcmem = offset_address (srcmem, GEN_INT (epilogue_size), 1);
+ srcmem = offset_address (srcmem,
+ GEN_INT (-GET_MODE_SIZE (epilogue_mode)),
+ GET_MODE_SIZE (epilogue_mode));
+ emit_move_insn (tmp[8], srcmem);
+ }
+
+ rtx destmem = change_address (dst, mode, destreg);
+ for (i = 0; i < moves; i++)
+ {
+ emit_move_insn (destmem, tmp[i]);
+ destmem = offset_address (destmem,
+ GEN_INT (GET_MODE_SIZE (mode)),
+ GET_MODE_SIZE (mode));
+ }
+
+ if (epilogue_size)
+ {
+ /* Use overlapping move. */
+ destmem = adjust_address (destmem, epilogue_mode, 0);
+ destmem = offset_address (destmem, GEN_INT (epilogue_size), 1);
+ destmem = offset_address (destmem,
+ GEN_INT (-GET_MODE_SIZE (epilogue_mode)),
+ GET_MODE_SIZE (epilogue_mode));
+ emit_move_insn (destmem, tmp[8]);
+ }
+
+ return true;
+}
+
+/* Expand a memmove of MOVES * mode size bytes, where MOVES <= 4.  If
+ FORWARD is true, copy forward. Otherwise copy backward. */
+
+static void
+ix86_expand_n_move_movmem (rtx destmem, rtx srcmem, machine_mode mode,
+ unsigned int moves, bool forward)
+{
+ gcc_assert (moves <= 4);
+
+ unsigned int i;
+ rtx tmp[8];
+
+ for (i = 0; i < moves; i++)
+ tmp[i] = gen_reg_rtx (mode);
+
+ rtx step;
+ if (forward)
+ step = GEN_INT (GET_MODE_SIZE (mode));
+ else
+ step = GEN_INT (-GET_MODE_SIZE (mode));
+
+ /* Load MOVES. */
+ for (i = 0; i < moves - 1; i++)
+ {
+ emit_move_insn (tmp[i], srcmem);
+ srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode));
+ }
+ emit_move_insn (tmp[i], srcmem);
+
+ /* Store MOVES. */
+ for (i = 0; i < moves - 1; i++)
+ {
+ emit_move_insn (destmem, tmp[i]);
+ destmem = offset_address (destmem, step, GET_MODE_SIZE (mode));
+ }
+ emit_move_insn (destmem, tmp[i]);
+}
+
+/* Load MOVES chunks of mode size into REGS.  If LAST is true, load
+   the last MOVES chunks.  Otherwise, load the first MOVES chunks. */
+
+static void
+ix86_expand_load_movmem (rtx src, rtx srcreg, rtx count_exp,
+ machine_mode mode, unsigned int moves,
+ rtx regs[], bool last)
+{
+ unsigned int i;
+
+ for (i = 0; i < moves; i++)
+ regs[i] = gen_reg_rtx (mode);
+
+ rtx srcmem = change_address (src, mode, srcreg);
+ rtx step;
+ if (last)
+ {
+ srcmem = offset_address (srcmem, count_exp, 1);
+ step = GEN_INT (-GET_MODE_SIZE (mode));
+ srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode));
+ }
+ else
+ step = GEN_INT (GET_MODE_SIZE (mode));
+
+ for (i = 0; i < moves - 1; i++)
+ {
+ emit_move_insn (regs[i], srcmem);
+ srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode));
+ }
+ emit_move_insn (regs[i], srcmem);
+}
+
+/* Store MOVES chunks of mode size from REGS.  If LAST is true, store
+   the last MOVES chunks.  Otherwise, store the first MOVES chunks. */
+
+static void
+ix86_expand_store_movmem (rtx dst, rtx destreg, rtx count_exp,
+ machine_mode mode, unsigned int moves,
+ rtx regs[], bool last)
+{
+ unsigned int i;
+
+ rtx destmem = change_address (dst, mode, destreg);
+ rtx step;
+ if (last)
+ {
+ destmem = offset_address (destmem, count_exp, 1);
+ step = GEN_INT (-GET_MODE_SIZE (mode));
+ destmem = offset_address (destmem, step, GET_MODE_SIZE (mode));
+ }
+ else
+ step = GEN_INT (GET_MODE_SIZE (mode));
+
+ for (i = 0; i < moves - 1; i++)
+ {
+ emit_move_insn (destmem, regs[i]);
+ destmem = offset_address (destmem, step, GET_MODE_SIZE (mode));
+ }
+ emit_move_insn (destmem, regs[i]);
+}
+
+/* Expand a memmove of size between (MOVES / 2) * mode size and
+   MOVES * mode size with overlapping loads and stores.  MOVES is
+   even, with 2 <= MOVES <= 8. */
+
+static void
+ix86_expand_n_overlapping_move_movmem (rtx dst, rtx src, rtx destreg,
+ rtx srcreg, rtx count_exp,
+ machine_mode mode,
+ unsigned int moves)
+{
+ gcc_assert (moves >= 2 && moves <= 8 && (moves & 1) == 0);
+
+ unsigned int half_moves = moves / 2;
+ unsigned int i, j;
+ rtx tmp[8];
+
+ for (i = 0; i < moves; i++)
+ tmp[i] = gen_reg_rtx (mode);
+
+ rtx base_srcmem = change_address (src, mode, srcreg);
+
+ /* Load the first half. */
+ rtx srcmem = base_srcmem;
+ for (i = 0; i < half_moves - 1; i++)
+ {
+ emit_move_insn (tmp[i], srcmem);
+ srcmem = offset_address (srcmem,
+ GEN_INT (GET_MODE_SIZE (mode)),
+ GET_MODE_SIZE (mode));
+ }
+ emit_move_insn (tmp[i], srcmem);
+
+ /* Load the second half. */
+ srcmem = offset_address (base_srcmem, count_exp, 1);
+ srcmem = offset_address (srcmem,
+ GEN_INT (-GET_MODE_SIZE (mode)),
+ GET_MODE_SIZE (mode));
+ for (j = half_moves, i = 0; i < half_moves - 1; i++, j++)
+ {
+ emit_move_insn (tmp[j], srcmem);
+ srcmem = offset_address (srcmem,
+ GEN_INT (-GET_MODE_SIZE (mode)),
+ GET_MODE_SIZE (mode));
+ }
+ emit_move_insn (tmp[j], srcmem);
+
+ rtx base_destmem = change_address (dst, mode, destreg);
+
+ /* Store the first half. */
+ rtx destmem = base_destmem;
+ for (i = 0; i < half_moves - 1; i++)
+ {
+ emit_move_insn (destmem, tmp[i]);
+ destmem = offset_address (destmem,
+ GEN_INT (GET_MODE_SIZE (mode)),
+ GET_MODE_SIZE (mode));
+ }
+ emit_move_insn (destmem, tmp[i]);
+
+ /* Store the second half. */
+ destmem = offset_address (base_destmem, count_exp, 1);
+ destmem = offset_address (destmem, GEN_INT (-GET_MODE_SIZE (mode)),
+ GET_MODE_SIZE (mode));
+ for (j = half_moves, i = 0; i < half_moves - 1; i++, j++)
+ {
+ emit_move_insn (destmem, tmp[j]);
+ destmem = offset_address (destmem, GEN_INT (-GET_MODE_SIZE (mode)),
+ GET_MODE_SIZE (mode));
+ }
+ emit_move_insn (destmem, tmp[j]);
+}
+
+/* Expand a memmove of size < mode size, where mode size is <= 64. */
+
+static void
+ix86_expand_less_move_movmem (rtx dst, rtx src, rtx destreg,
+ rtx srcreg, rtx count_exp,
+ unsigned HOST_WIDE_INT min_size,
+ machine_mode mode,
+ rtx_code_label *done_label)
+{
+ bool skip = false;
+ machine_mode count_mode = counter_mode (count_exp);
+
+ rtx_code_label *between_32_63_label
+ = GET_MODE_SIZE (mode) > 32 ? gen_label_rtx () : nullptr;
+  /* Jump to BETWEEN_32_63_LABEL if size >= 32 and size < 64. */
+ if (between_32_63_label)
+ {
+ if (min_size && min_size >= 32)
+ {
+ emit_jump_insn (gen_jump (between_32_63_label));
+ emit_barrier ();
+ skip = true;
+ }
+ else
+ emit_cmp_and_jump_insns (count_exp, GEN_INT (32), GEU,
+ nullptr, count_mode, 1,
+ between_32_63_label);
+ }
+
+ rtx_code_label *between_16_31_label
+ = (!skip && GET_MODE_SIZE (mode) > 16) ? gen_label_rtx () : nullptr;
+  /* Jump to BETWEEN_16_31_LABEL if size >= 16 and size < 32. */
+ if (between_16_31_label)
+ {
+ if (min_size && min_size >= 16)
+ {
+ emit_jump_insn (gen_jump (between_16_31_label));
+ emit_barrier ();
+ skip = true;
+ }
+ else
+ emit_cmp_and_jump_insns (count_exp, GEN_INT (16), GEU,
+ nullptr, count_mode, 1,
+ between_16_31_label);
+ }
+
+ rtx_code_label *between_8_15_label
+ = (!skip && GET_MODE_SIZE (mode) > 8) ? gen_label_rtx () : nullptr;
+  /* Jump to BETWEEN_8_15_LABEL if size >= 8 and size < 16. */
+ if (between_8_15_label)
+ {
+ if (min_size && min_size >= 8)
+ {
+ emit_jump_insn (gen_jump (between_8_15_label));
+ emit_barrier ();
+ skip = true;
+ }
+ else
+ emit_cmp_and_jump_insns (count_exp, GEN_INT (8), GEU,
+ nullptr, count_mode, 1,
+ between_8_15_label);
+ }
+
+ rtx_code_label *between_4_7_label
+ = (!skip && GET_MODE_SIZE (mode) > 4) ? gen_label_rtx () : nullptr;
+  /* Jump to BETWEEN_4_7_LABEL if size >= 4 and size < 8. */
+ if (between_4_7_label)
+ {
+ if (min_size && min_size >= 4)
+ {
+ emit_jump_insn (gen_jump (between_4_7_label));
+ emit_barrier ();
+ skip = true;
+ }
+ else
+ emit_cmp_and_jump_insns (count_exp, GEN_INT (4), GEU,
+ nullptr, count_mode, 1,
+ between_4_7_label);
+ }
+
+ rtx_code_label *between_2_3_label
+ = (!skip && GET_MODE_SIZE (mode) > 2) ? gen_label_rtx () : nullptr;
+  /* Jump to BETWEEN_2_3_LABEL if size >= 2 and size < 4. */
+ if (between_2_3_label)
+ {
+ if (min_size && min_size >= 2)
+ {
+ emit_jump_insn (gen_jump (between_2_3_label));
+ emit_barrier ();
+ skip = true;
+ }
+ else
+ emit_cmp_and_jump_insns (count_exp, GEN_INT (1), GT,
+ nullptr, count_mode, 1,
+ between_2_3_label);
+ }
+
+ if (!skip)
+ {
+ rtx_code_label *zero_label
+ = min_size == 0 ? gen_label_rtx () : nullptr;
+ /* Skip if size == 0. */
+ if (zero_label)
+ emit_cmp_and_jump_insns (count_exp, GEN_INT (1), LT,
+ nullptr, count_mode, 1,
+ zero_label,
+ profile_probability::unlikely ());
+
+ /* Move 1 byte. */
+ rtx tmp0 = gen_reg_rtx (QImode);
+ rtx srcmem = change_address (src, QImode, srcreg);
+ emit_move_insn (tmp0, srcmem);
+ rtx destmem = change_address (dst, QImode, destreg);
+ emit_move_insn (destmem, tmp0);
+
+ if (zero_label)
+ emit_label (zero_label);
+
+ emit_jump_insn (gen_jump (done_label));
+ emit_barrier ();
+ }
+
+ if (between_32_63_label)
+ {
+ emit_label (between_32_63_label);
+ ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
+ count_exp, OImode, 2);
+ emit_jump_insn (gen_jump (done_label));
+ emit_barrier ();
+ }
+
+ if (between_16_31_label)
+ {
+ emit_label (between_16_31_label);
+ ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
+ count_exp, TImode, 2);
+ emit_jump_insn (gen_jump (done_label));
+ emit_barrier ();
+ }
+
+ if (between_8_15_label)
+ {
+ emit_label (between_8_15_label);
+ ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
+ count_exp, DImode, 2);
+ emit_jump_insn (gen_jump (done_label));
+ emit_barrier ();
+ }
+
+ if (between_4_7_label)
+ {
+ emit_label (between_4_7_label);
+ ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
+ count_exp, SImode, 2);
+ emit_jump_insn (gen_jump (done_label));
+ emit_barrier ();
+ }
+
+ if (between_2_3_label)
+ {
+ emit_label (between_2_3_label);
+ ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
+ count_exp, HImode, 2);
+ emit_jump_insn (gen_jump (done_label));
+ emit_barrier ();
+ }
+}
+
+/* Expand movmem with overlapping unaligned loads and stores:
+ 1. Load all sources into registers and store them together to avoid
+ possible address overlap between source and destination.
+ 2. For known size, first try to fully unroll with 8 registers.
+ 3. For size <= 2 * MOVE_MAX, load all sources into 2 registers first
+ and then store them together.
+ 4. For size > 2 * MOVE_MAX and size <= 4 * MOVE_MAX, load all sources
+ into 4 registers first and then store them together.
+ 5. For size > 4 * MOVE_MAX and size <= 8 * MOVE_MAX, load all sources
+ into 8 registers first and then store them together.
+ 6. For size > 8 * MOVE_MAX,
+ a. If address of destination > address of source, copy backward
+ with a 4 * MOVE_MAX loop with unaligned loads and stores. Load
+ the first 4 * MOVE_MAX into 4 registers before the loop and
+ store them after the loop to support overlapping addresses.
+ b. Otherwise, copy forward with a 4 * MOVE_MAX loop with unaligned
+ loads and stores. Load the last 4 * MOVE_MAX into 4 registers
+ before the loop and store them after the loop to support
+ overlapping addresses.
+ */
+
+bool
+ix86_expand_movmem (rtx operands[])
+{
+  /* Since far fewer registers are available in 32-bit mode, don't
+     inline movmem in 32-bit mode. */
+ if (!TARGET_64BIT)
+ return false;
+
+ rtx dst = operands[0];
+ rtx src = operands[1];
+ rtx count_exp = operands[2];
+ rtx expected_size_exp = operands[5];
+ rtx min_size_exp = operands[6];
+ rtx probable_max_size_exp = operands[8];
+ unsigned HOST_WIDE_INT count = HOST_WIDE_INT_0U;
+ HOST_WIDE_INT expected_size = HOST_WIDE_INT_M1U;
+ unsigned HOST_WIDE_INT min_size = HOST_WIDE_INT_0U;
+ unsigned HOST_WIDE_INT probable_max_size = HOST_WIDE_INT_M1U;
+
+ if (CONST_INT_P (count_exp))
+ {
+ min_size = probable_max_size = count = expected_size
+ = INTVAL (count_exp);
+ /* When COUNT is 0, there is nothing to do. */
+ if (!count)
+ return true;
+ }
+ else
+ {
+ if (min_size_exp)
+ min_size = INTVAL (min_size_exp);
+ if (probable_max_size_exp)
+ probable_max_size = INTVAL (probable_max_size_exp);
+ if (CONST_INT_P (expected_size_exp))
+ expected_size = INTVAL (expected_size_exp);
+ }
+
+ /* Make sure we don't need to care about overflow later on. */
+ if (count > (HOST_WIDE_INT_1U << 30))
+ return false;
+
+ addr_space_t dst_as = MEM_ADDR_SPACE (dst);
+ addr_space_t src_as = MEM_ADDR_SPACE (src);
+ int dynamic_check;
+ bool noalign;
+ enum stringop_alg alg = decide_alg (count, expected_size, min_size,
+ probable_max_size, false, false,
+ dst_as, src_as, &dynamic_check,
+ &noalign, false);
+ if (alg == libcall)
+ return false;
+
+ rtx destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
+ rtx srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
+
+ unsigned int move_max = MOVE_MAX;
+ machine_mode mode = smallest_int_mode_for_size
+ (move_max * BITS_PER_UNIT).require ();
+ if (probable_max_size && probable_max_size < move_max)
+ {
+ /* Get a usable MOVE_MAX. */
+ mode = smallest_int_mode_for_size
+ (probable_max_size * BITS_PER_UNIT).require ();
+      /* Halve the mode size so that it fits in PROBABLE_MAX_SIZE. */
+ if (GET_MODE_SIZE (mode) > probable_max_size)
+ mode = smallest_int_mode_for_size
+ (GET_MODE_BITSIZE (mode) / 2).require ();
+ move_max = GET_MODE_SIZE (mode);
+ }
+
+ /* Try to fully unroll memmove of known size first. */
+ if (count
+ && ix86_expand_unroll_movmem (dst, src, destreg, srcreg, count,
+ mode))
+ return true;
+
+ rtx_code_label *done_label = gen_label_rtx ();
+
+ rtx_code_label *less_vec_label = nullptr;
+ if (min_size == 0 || min_size < move_max)
+ less_vec_label = gen_label_rtx ();
+
+ machine_mode count_mode = counter_mode (count_exp);
+
+ /* Jump to LESS_VEC_LABEL if size < MOVE_MAX. */
+ if (less_vec_label)
+ emit_cmp_and_jump_insns (count_exp, GEN_INT (move_max), LTU,
+ nullptr, count_mode, 1,
+ less_vec_label);
+
+ rtx_code_label *more_2x_vec_label = nullptr;
+ if (probable_max_size == 0 || probable_max_size > 2 * move_max)
+ more_2x_vec_label = gen_label_rtx ();
+
+ /* Jump to MORE_2X_VEC_LABEL if size > 2 * MOVE_MAX. */
+ if (more_2x_vec_label)
+ emit_cmp_and_jump_insns (count_exp, GEN_INT (2 * move_max), GTU,
+ nullptr, count_mode, 1,
+ more_2x_vec_label);
+
+ if (min_size == 0 || min_size <= 2 * move_max)
+ {
+ /* Size >= MOVE_MAX and size <= 2 * MOVE_MAX. */
+ ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
+ count_exp, mode, 2);
+ emit_jump_insn (gen_jump (done_label));
+ emit_barrier ();
+ }
+
+ if (less_vec_label)
+ {
+ /* Size < MOVE_MAX. */
+ emit_label (less_vec_label);
+ ix86_expand_less_move_movmem (dst, src, destreg, srcreg,
+ count_exp, min_size, mode,
+ done_label);
+ emit_jump_insn (gen_jump (done_label));
+ emit_barrier ();
+ }
+
+ if (more_2x_vec_label)
+ {
+ /* Size > 2 * MOVE_MAX and destination may overlap with source. */
+ emit_label (more_2x_vec_label);
+
+ rtx_code_label *more_8x_vec_label = nullptr;
+ if (probable_max_size == 0 || probable_max_size > 8 * move_max)
+ more_8x_vec_label = gen_label_rtx ();
+
+ /* Jump to MORE_8X_VEC_LABEL if size > 8 * MOVE_MAX. */
+ if (more_8x_vec_label)
+ emit_cmp_and_jump_insns (count_exp, GEN_INT (8 * move_max), GTU,
+ nullptr, count_mode, 1,
+ more_8x_vec_label);
+
+ rtx_code_label *last_4x_vec_label = nullptr;
+ if (min_size == 0 || min_size < 4 * move_max)
+ last_4x_vec_label = gen_label_rtx ();
+
+ /* Jump to LAST_4X_VEC_LABEL if size < 4 * MOVE_MAX. */
+ if (last_4x_vec_label)
+ emit_cmp_and_jump_insns (count_exp, GEN_INT (4 * move_max), LTU,
+ nullptr, count_mode, 1,
+ last_4x_vec_label);
+
+ if (probable_max_size == 0 || probable_max_size > 4 * move_max)
+ {
+ /* Size > 4 * MOVE_MAX and size <= 8 * MOVE_MAX. */
+ ix86_expand_n_overlapping_move_movmem (dst, src, destreg,
+ srcreg, count_exp,
+ mode, 8);
+ emit_jump_insn (gen_jump (done_label));
+ emit_barrier ();
+ }
+
+ if (last_4x_vec_label)
+ {
+ /* Size > 2 * MOVE_MAX and size <= 4 * MOVE_MAX. */
+ emit_label (last_4x_vec_label);
+ ix86_expand_n_overlapping_move_movmem (dst, src, destreg,
+ srcreg, count_exp,
+ mode, 4);
+ emit_jump_insn (gen_jump (done_label));
+ emit_barrier ();
+ }
+
+ if (more_8x_vec_label)
+ {
+ /* Size > 8 * MOVE_MAX. */
+ emit_label (more_8x_vec_label);
+
+ rtx loop_count = gen_reg_rtx (count_mode);
+ emit_move_insn (loop_count, count_exp);
+
+ /* Jump to MORE_8X_VEC_BACKWARD_LABEL if source address is
+ lower than destination address. */
+ rtx_code_label *more_8x_vec_backward_label = gen_label_rtx ();
+ emit_cmp_and_jump_insns (srcreg, destreg, LTU, nullptr,
+ GET_MODE (destreg), 1,
+ more_8x_vec_backward_label);
+
+	  /* Skip if source == destination, which is less common. */
+ emit_cmp_and_jump_insns (srcreg, destreg, EQ, nullptr,
+ GET_MODE (destreg), 1, done_label,
+ profile_probability::unlikely ());
+
+ rtx base_destreg = gen_reg_rtx (GET_MODE (destreg));
+ emit_move_insn (base_destreg, destreg);
+
+ /* Load the last 4 * MOVE_MAX. */
+ rtx regs[4];
+ ix86_expand_load_movmem (src, srcreg, count_exp, mode,
+ ARRAY_SIZE (regs), regs, true);
+
+ rtx srcmem = change_address (src, mode, srcreg);
+ rtx destmem = change_address (dst, mode, destreg);
+
+ /* Copy forward with a 4 * MOVE_MAX loop. */
+ rtx_code_label *loop_4x_vec_forward_label = gen_label_rtx ();
+ emit_label (loop_4x_vec_forward_label);
+
+ ix86_expand_n_move_movmem (destmem, srcmem, mode, 4, true);
+
+ rtx tmp;
+ rtx delta = GEN_INT (4 * MOVE_MAX);
+
+ /* Decrement LOOP_COUNT by 4 * MOVE_MAX. */
+ tmp = expand_simple_binop (GET_MODE (loop_count), MINUS,
+ loop_count, delta, nullptr, 1,
+ OPTAB_DIRECT);
+ if (tmp != loop_count)
+ emit_move_insn (loop_count, tmp);
+
+ /* Increment DESTREG and SRCREG by 4 * MOVE_MAX. */
+ tmp = expand_simple_binop (GET_MODE (destreg), PLUS,
+ destreg, delta, nullptr, 1,
+ OPTAB_DIRECT);
+ if (tmp != destreg)
+ emit_move_insn (destreg, tmp);
+ tmp = expand_simple_binop (GET_MODE (srcreg), PLUS, srcreg,
+ delta, nullptr, 1, OPTAB_DIRECT);
+ if (tmp != srcreg)
+ emit_move_insn (srcreg, tmp);
+
+	  /* Loop back while LOOP_COUNT > 4 * MOVE_MAX. */
+ emit_cmp_and_jump_insns (loop_count, delta, GTU, nullptr,
+ GET_MODE (loop_count), 1,
+ loop_4x_vec_forward_label);
+
+ /* Store the last 4 * MOVE_MAX. */
+ ix86_expand_store_movmem (dst, base_destreg, count_exp, mode,
+ ARRAY_SIZE (regs), regs, true);
+
+ emit_jump_insn (gen_jump (done_label));
+ emit_barrier ();
+
+ /* Copy backward with a 4 * MOVE_MAX loop. */
+ emit_label (more_8x_vec_backward_label);
+
+ base_destreg = gen_reg_rtx (GET_MODE (destreg));
+ emit_move_insn (base_destreg, destreg);
+
+ /* Load the first 4 * MOVE_MAX. */
+ ix86_expand_load_movmem (src, srcreg, count_exp, mode,
+ ARRAY_SIZE (regs), regs, false);
+
+ /* Increment DESTREG and SRCREG by COUNT_EXP. */
+ tmp = expand_simple_binop (GET_MODE (destreg), PLUS,
+ destreg, count_exp, nullptr, 1,
+ OPTAB_DIRECT);
+ if (tmp != destreg)
+ emit_move_insn (destreg, tmp);
+ tmp = expand_simple_binop (GET_MODE (srcreg), PLUS, srcreg,
+ count_exp, nullptr, 1, OPTAB_DIRECT);
+ if (tmp != srcreg)
+ emit_move_insn (srcreg, tmp);
+
+ srcmem = change_address (src, mode, srcreg);
+ destmem = change_address (dst, mode, destreg);
+ rtx step = GEN_INT (-GET_MODE_SIZE (mode));
+ srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode));
+ destmem = offset_address (destmem, step, GET_MODE_SIZE (mode));
+
+ rtx_code_label *loop_4x_vec_backward_label = gen_label_rtx ();
+ emit_label (loop_4x_vec_backward_label);
+
+ ix86_expand_n_move_movmem (destmem, srcmem, mode, 4, false);
+
+ /* Decrement LOOP_COUNT by 4 * MOVE_MAX. */
+ tmp = expand_simple_binop (GET_MODE (loop_count), MINUS,
+ loop_count, delta, nullptr, 1,
+ OPTAB_DIRECT);
+ if (tmp != loop_count)
+ emit_move_insn (loop_count, tmp);
+
+ /* Decrement DESTREG and SRCREG by 4 * MOVE_MAX. */
+ tmp = expand_simple_binop (GET_MODE (destreg), MINUS,
+ destreg, delta, nullptr, 1,
+ OPTAB_DIRECT);
+ if (tmp != destreg)
+ emit_move_insn (destreg, tmp);
+ tmp = expand_simple_binop (GET_MODE (srcreg), MINUS, srcreg,
+ delta, nullptr, 1, OPTAB_DIRECT);
+ if (tmp != srcreg)
+ emit_move_insn (srcreg, tmp);
+
+	  /* Loop back while LOOP_COUNT > 4 * MOVE_MAX. */
+ emit_cmp_and_jump_insns (loop_count, delta, GTU, nullptr,
+ GET_MODE (loop_count), 1,
+ loop_4x_vec_backward_label);
+
+ /* Store the first 4 * MOVE_MAX. */
+ ix86_expand_store_movmem (dst, base_destreg, count_exp, mode,
+ ARRAY_SIZE (regs), regs, false);
+
+ emit_jump_insn (gen_jump (done_label));
+ emit_barrier ();
+ }
+ }
+
+ emit_label (done_label);
+
+ return true;
+}
+
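The register-buffered, overlapping scheme described above can be pictured in C. A sketch (illustrative only, not the emitted RTL) of the 9-to-16-byte case, assuming unaligned 8-byte loads and stores are cheap:

  #include <stdint.h>
  #include <string.h>

  /* Both (possibly overlapping) halves are loaded before any store,
     so the copy is correct for either direction of src/dst overlap.  */
  void
  move_9_to_16 (char *dst, const char *src, size_t n)
  {
    uint64_t head, tail;
    memcpy (&head, src, 8);
    memcpy (&tail, src + n - 8, 8);  /* overlaps head when n < 16 */
    memcpy (dst, &head, 8);
    memcpy (dst + n - 8, &tail, 8);
  }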
/* Expand cmpstrn or memcmp. */
bool
@@ -26377,17 +27130,15 @@ ix86_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
struct expand_operand ops[5];
int dfv;
- push_to_sequence (*prep_seq);
- expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
-
- cmp_mode = op_mode = GET_MODE (op0);
+  /* Exit early for non-integer modes to avoid the O(n^2) part of
+     expand_operands. */
+ cmp_mode = op_mode = TYPE_MODE (TREE_TYPE (treeop0));
if (!(op_mode == DImode || op_mode == SImode || op_mode == HImode
|| op_mode == QImode))
- {
- end_sequence ();
- return NULL_RTX;
- }
+ return NULL_RTX;
+
+ push_to_sequence (*prep_seq);
+ expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
icode = code_for_ccmp (op_mode);
diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 8e27784..ce6f40b 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -3947,12 +3947,20 @@ ix86_emit_tls_call (rtx tls_set, x86_cse_kind kind, basic_block bb,
(note 2 3 5 2 NOTE_INSN_FUNCTION_BEG)
(debug_insn 5 2 16 2 (debug_marker) "x.c":6:3 -1 (nil))
+ or a basic block with only deleted instructions:
+
+ (code_label 348 23 349 45 3 (nil) [0 uses])
+ (note 349 348 436 45 [bb 45] NOTE_INSN_BASIC_BLOCK)
+ (note 436 349 362 45 NOTE_INSN_DELETED)
+
*/
gcc_assert (DEBUG_INSN_P (insn)
|| (NOTE_P (insn)
&& ((NOTE_KIND (insn)
== NOTE_INSN_FUNCTION_BEG)
|| (NOTE_KIND (insn)
+ == NOTE_INSN_DELETED)
+ || (NOTE_KIND (insn)
== NOTE_INSN_BASIC_BLOCK))));
insn = NULL;
break;
@@ -4810,6 +4818,9 @@ pass_x86_cse::x86_cse (void)
df_process_deferred_rescans ();
}
+ FOR_EACH_VEC_ELT (loads, i, load)
+ delete load;
+
df_clear_flags (DF_DEFER_INSN_RESCAN);
timevar_pop (TV_MACH_DEP);
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index dadcf76..35064d8 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -1837,6 +1837,21 @@ set_ix86_tune_features (struct gcc_options *opts,
}
parse_mtune_ctrl_str (opts, dump);
+
+  /* The -mgather/-mscatter options override the -mtune-ctrl settings. */
+ if (OPTION_SET_P (ix86_use_gather))
+ {
+ ix86_tune_features[X86_TUNE_USE_GATHER_2PARTS] = ix86_use_gather;
+ ix86_tune_features[X86_TUNE_USE_GATHER_4PARTS] = ix86_use_gather;
+ ix86_tune_features[X86_TUNE_USE_GATHER_8PARTS] = ix86_use_gather;
+ }
+
+ if (OPTION_SET_P (ix86_use_scatter))
+ {
+ ix86_tune_features[X86_TUNE_USE_SCATTER_2PARTS] = ix86_use_scatter;
+ ix86_tune_features[X86_TUNE_USE_SCATTER_4PARTS] = ix86_use_scatter;
+ ix86_tune_features[X86_TUNE_USE_SCATTER_8PARTS] = ix86_use_scatter;
+ }
}
@@ -2917,7 +2932,7 @@ ix86_option_override_internal (bool main_args_p,
else
{
opts->x_ix86_move_max = opts->x_prefer_vector_width_type;
- if (opts_set->x_ix86_move_max == PVW_NONE)
+ if (opts->x_ix86_move_max == PVW_NONE)
{
if (TARGET_AVX512F_P (opts->x_ix86_isa_flags))
opts->x_ix86_move_max = PVW_AVX512;
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index bdb8bb9..5ff414a 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -78,6 +78,7 @@ extern void substitute_vpternlog_operands (rtx[]);
extern bool ix86_expand_strlen (rtx, rtx, rtx, rtx);
extern bool ix86_expand_set_or_cpymem (rtx, rtx, rtx, rtx, rtx, rtx,
rtx, rtx, rtx, rtx, bool);
+extern bool ix86_expand_movmem (rtx[]);
extern bool ix86_expand_cmpstrn_or_cmpmem (rtx, rtx, rtx, rtx, rtx, bool);
extern enum reg_class ix86_insn_base_reg_class (rtx_insn *);
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 587b2bd..75a9cb6 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -598,6 +598,20 @@ ix86_canonicalize_comparison (int *code, rtx *op0, rtx *op1,
}
}
+ /* SUB (a, b) underflows precisely when a < b. Convert
+     (compare (minus a b) a) to (compare a b)
+     to match the *sub<mode>_3 pattern. */
+ if (!op0_preserve_value
+ && (*code == GTU || *code == LEU)
+ && GET_CODE (*op0) == MINUS
+ && rtx_equal_p (XEXP (*op0, 0), *op1))
+ {
+ *op1 = XEXP (*op0, 1);
+ *op0 = XEXP (*op0, 0);
+ *code = (int) swap_condition ((enum rtx_code) *code);
+ return;
+ }
+
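A sketch of the source shape this canonicalization helps: the comparison can reuse the flags set by the subtraction itself (sub; setc) instead of a separate cmp.

  unsigned
  sub_and_check (unsigned a, unsigned b, int *underflow)
  {
    unsigned d = a - b;
    *underflow = d > a;  /* d > a exactly when the subtraction wrapped,
                            i.e. when a < b */
    return d;
  }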
/* Swap operands of GTU comparison to canonicalize
addcarry/subborrow comparison. */
if (!op0_preserve_value
@@ -23753,9 +23767,15 @@ x86_print_call_or_nop (FILE *file, const char *target,
const char *label)
{
if (flag_nop_mcount || !strcmp (target, "nop"))
- /* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1) */
- fprintf (file, "%s" ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n",
- label);
+ {
+ if (TARGET_16BIT)
+	/* 3 byte nop: lea 0(%si), %si */
+ fprintf (file, "%s" ASM_BYTE "0x8d, 0x74, 0x00\n", label);
+ else
+ /* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1) */
+ fprintf (file, "%s" ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n",
+ label);
+ }
else if (!TARGET_PECOFF && flag_pic)
{
gcc_assert (flag_plt);
@@ -25089,7 +25109,7 @@ i386_solaris_elf_named_section (const char *name, unsigned int flags,
return;
}
-#ifndef USE_GAS
+#if !HAVE_GNU_AS
if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
{
solaris_elf_asm_comdat_section (name, flags, decl);
@@ -26377,7 +26397,20 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
(TREE_OPERAND (gimple_assign_rhs1 (def), 0))))))
{
if (fp)
- m_num_sse_needed[where]++;
+ {
+ /* Scalar FP values residing in x87 registers need to be
+ spilled and reloaded. */
+ auto mode2 = TYPE_MODE (TREE_TYPE (op));
+ if (IS_STACK_MODE (mode2))
+ {
+ int cost
+ = (ix86_cost->hard_register.fp_store[mode2 == SFmode
+ ? 0 : 1]
+ + ix86_cost->sse_load[sse_store_index (mode2)]);
+ stmt_cost += COSTS_N_INSNS (cost) / 2;
+ }
+ m_num_sse_needed[where]++;
+ }
else
{
m_num_gpr_needed[where]++;
@@ -26595,6 +26628,11 @@ ix86_vector_costs::finish_cost (const vector_costs *scalar_costs)
if (loop_vinfo
&& !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
&& LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant () > 2
+      /* Avoid a masked epilogue if cascading epilogues would eventually
+	 reach one with VF 1, as that means no scalar epilogue at all. */
+ && !((GET_MODE_SIZE (loop_vinfo->vector_mode)
+ / LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant () == 16)
+ && ix86_tune_features[X86_TUNE_AVX512_TWO_EPILOGUES])
&& ix86_tune_features[X86_TUNE_AVX512_MASKED_EPILOGUES]
&& !OPTION_SET_P (param_vect_partial_vector_usage))
{
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 94f335f..b934117 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -2488,7 +2488,11 @@ constexpr wide_int_bitmask PTA_DIAMONDRAPIDS = PTA_GRANITERAPIDS_D
| PTA_CMPCCXADD | PTA_SHA512 | PTA_SM3 | PTA_SM4 | PTA_AVX10_2
| PTA_APX_F | PTA_AMX_AVX512 | PTA_AMX_FP8 | PTA_AMX_TF32 | PTA_MOVRS
| PTA_AMX_MOVRS;
-constexpr wide_int_bitmask PTA_NOVALAKE = PTA_PANTHERLAKE | PTA_PREFETCHI;
+constexpr wide_int_bitmask PTA_NOVALAKE = PTA_PANTHERLAKE | PTA_PREFETCHI
+ | PTA_AVX512F | PTA_AVX512CD | PTA_AVX512VL | PTA_AVX512BW | PTA_AVX512DQ
+ | PTA_AVX512VBMI | PTA_AVX512IFMA | PTA_AVX512VNNI | PTA_AVX512VBMI2
+ | PTA_AVX512BITALG | PTA_AVX512VPOPCNTDQ | PTA_AVX512FP16 | PTA_AVX512BF16
+ | PTA_AVX10_1 | PTA_AVX10_2 | PTA_APX_F | PTA_MOVRS;
constexpr wide_int_bitmask PTA_BDVER1 = PTA_64BIT | PTA_MMX | PTA_SSE
| PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_POPCNT | PTA_LZCNT
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 218377a..df7135f 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -8642,7 +8642,7 @@
[(set (reg FLAGS_REG)
(compare (match_operand:SWI 1 "nonimmediate_operand" "0,0,rm,r")
(match_operand:SWI 2 "<general_operand>" "<r><i>,<m>,r<i>,<m>")))
- (set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m,<r>i,r,r")
+ (set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m,<r>,r,r")
(minus:SWI (match_dup 1) (match_dup 2)))]
"ix86_match_ccmode (insn, CCmode)
&& ix86_binary_operator_ok (MINUS, <MODE>mode, operands, TARGET_APX_NDD)"
@@ -8860,6 +8860,35 @@
(match_dup 0)))
(clobber (reg:CC FLAGS_REG))])])
+(define_insn "*add<mode>3_carry_2"
+ [(set (reg FLAGS_REG)
+ (compare
+ (plus:SWI
+ (plus:SWI
+ (match_operator:SWI 4 "ix86_carry_flag_operator"
+ [(match_operand 3 "flags_reg_operand") (const_int 0)])
+ (match_operand:SWI 1 "nonimmediate_operand" "%0,0,rm,r"))
+ (match_operand:SWI 2 "<general_operand>" "<r><i>,<m>,r<i>,<m>"))
+ (const_int 0)))
+ (set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m,<r>,r,r")
+ (plus:SWI
+ (plus:SWI
+ (match_op_dup 4 [(match_dup 3) (const_int 0)])
+ (match_dup 1))
+ (match_dup 2)))]
+ "ix86_match_ccmode (insn, CCGOCmode)
+ && ix86_binary_operator_ok (PLUS, <MODE>mode, operands, TARGET_APX_NDD)"
+ "@
+ adc{<imodesuffix>}\t{%2, %0|%0, %2}
+ adc{<imodesuffix>}\t{%2, %0|%0, %2}
+ adc{<imodesuffix>}\t{%2, %1, %0|%0, %1, %2}
+ adc{<imodesuffix>}\t{%2, %1, %0|%0, %1, %2}"
+ [(set_attr "isa" "*,*,apx_ndd,apx_ndd")
+ (set_attr "type" "alu")
+ (set_attr "use_carry" "1")
+ (set_attr "pent_pair" "pu")
+ (set_attr "mode" "<MODE>")])
+
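One shape this pattern is intended to match, sketched in C (assuming combine folds the zero test into the flags the adc already sets): a double-word addition whose high half is immediately tested against zero.

  unsigned long long
  add128 (unsigned long long alo, unsigned long long ahi,
          unsigned long long blo, unsigned long long bhi,
          int *hi_nonzero)
  {
    unsigned __int128 a = ((unsigned __int128) ahi << 64) | alo;
    unsigned __int128 b = ((unsigned __int128) bhi << 64) | blo;
    unsigned __int128 s = a + b;  /* add; adc */
    *hi_nonzero = (unsigned long long) (s >> 64) != 0;
    return (unsigned long long) s;
  }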
(define_insn "*add<mode>3_carry_0"
[(set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m")
(plus:SWI
@@ -8874,6 +8903,26 @@
(set_attr "pent_pair" "pu")
(set_attr "mode" "<MODE>")])
+(define_insn "*add<mode>3_carry_0_cc"
+ [(set (reg FLAGS_REG)
+ (compare
+ (plus:SWI
+ (match_operator:SWI 2 "ix86_carry_flag_operator"
+ [(match_operand 3 "flags_reg_operand") (const_int 0)])
+ (match_operand:SWI 1 "nonimmediate_operand" "0"))
+ (const_int 0)))
+ (set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m")
+ (plus:SWI
+ (match_op_dup 2 [(match_dup 3) (const_int 0)])
+ (match_dup 1)))]
+ "ix86_match_ccmode (insn, CCGOCmode)
+ && (!MEM_P (operands[0]) || rtx_equal_p (operands[0], operands[1]))"
+ "adc{<imodesuffix>}\t{$0, %0|%0, 0}"
+ [(set_attr "type" "alu")
+ (set_attr "use_carry" "1")
+ (set_attr "pent_pair" "pu")
+ (set_attr "mode" "<MODE>")])
+
(define_insn "*add<mode>3_carry_0r"
[(set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m")
(plus:SWI
@@ -8888,6 +8937,26 @@
(set_attr "pent_pair" "pu")
(set_attr "mode" "<MODE>")])
+(define_insn "*add<mode>3_carry_0r_cc"
+ [(set (reg FLAGS_REG)
+ (compare
+ (plus:SWI
+ (match_operator:SWI 2 "ix86_carry_flag_unset_operator"
+ [(match_operand 3 "flags_reg_operand") (const_int 0)])
+ (match_operand:SWI 1 "nonimmediate_operand" "0"))
+ (const_int 0)))
+ (set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m")
+ (plus:SWI
+ (match_op_dup 2 [(match_dup 3) (const_int 0)])
+ (match_dup 1)))]
+ "ix86_match_ccmode (insn, CCGOCmode)
+ && (!MEM_P (operands[0]) || rtx_equal_p (operands[0], operands[1]))"
+ "sbb{<imodesuffix>}\t{$-1, %0|%0, -1}"
+ [(set_attr "type" "alu")
+ (set_attr "use_carry" "1")
+ (set_attr "pent_pair" "pu")
+ (set_attr "mode" "<MODE>")])
+
(define_insn "*addqi3_carry_zext<mode>"
[(set (match_operand:SWI248x 0 "register_operand" "=r,r")
(zero_extend:SWI248x
@@ -9456,6 +9525,35 @@
(match_dup 0)))
(clobber (reg:CC FLAGS_REG))])])
+(define_insn "*sub<mode>3_carry_2"
+ [(set (reg FLAGS_REG)
+ (compare
+ (minus:SWI
+ (minus:SWI
+ (match_operand:SWI 1 "nonimmediate_operand" "0,0,rm,r")
+ (match_operator:SWI 4 "ix86_carry_flag_operator"
+ [(match_operand 3 "flags_reg_operand") (const_int 0)]))
+ (match_operand:SWI 2 "<general_operand>" "<r><i>,<m>,r<i>,<m>"))
+ (const_int 0)))
+ (set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m,<r>,r,r")
+ (minus:SWI
+ (minus:SWI
+ (match_dup 1)
+ (match_op_dup 4 [(match_dup 3) (const_int 0)]))
+ (match_dup 2)))]
+ "ix86_match_ccmode (insn, CCGOCmode)
+ && ix86_binary_operator_ok (MINUS, <MODE>mode, operands, TARGET_APX_NDD)"
+ "@
+ sbb{<imodesuffix>}\t{%2, %0|%0, %2}
+ sbb{<imodesuffix>}\t{%2, %0|%0, %2}
+ sbb{<imodesuffix>}\t{%2, %1, %0|%0, %1, %2}
+ sbb{<imodesuffix>}\t{%2, %1, %0|%0, %1, %2}"
+ [(set_attr "isa" "*,*,apx_ndd,apx_ndd")
+ (set_attr "type" "alu")
+ (set_attr "use_carry" "1")
+ (set_attr "pent_pair" "pu")
+ (set_attr "mode" "<MODE>")])
+
(define_insn "*sub<mode>3_carry_0"
[(set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m")
(minus:SWI
@@ -9470,6 +9568,26 @@
(set_attr "pent_pair" "pu")
(set_attr "mode" "<MODE>")])
+(define_insn "*sub<mode>3_carry_0_cc"
+ [(set (reg FLAGS_REG)
+ (compare
+ (minus:SWI
+ (match_operand:SWI 1 "nonimmediate_operand" "0")
+ (match_operator:SWI 2 "ix86_carry_flag_operator"
+ [(match_operand 3 "flags_reg_operand") (const_int 0)]))
+ (const_int 0)))
+ (set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m")
+ (minus:SWI
+ (match_dup 1)
+ (match_op_dup 2 [(match_dup 3) (const_int 0)])))]
+ "ix86_match_ccmode (insn, CCGOCmode)
+ && (!MEM_P (operands[0]) || rtx_equal_p (operands[0], operands[1]))"
+ "sbb{<imodesuffix>}\t{$0, %0|%0, 0}"
+ [(set_attr "type" "alu")
+ (set_attr "use_carry" "1")
+ (set_attr "pent_pair" "pu")
+ (set_attr "mode" "<MODE>")])
+
(define_insn "*sub<mode>3_carry_0r"
[(set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m")
(minus:SWI
@@ -9484,6 +9602,26 @@
(set_attr "pent_pair" "pu")
(set_attr "mode" "<MODE>")])
+(define_insn "*sub<mode>3_carry_0r_cc"
+ [(set (reg FLAGS_REG)
+ (compare
+ (minus:SWI
+ (match_operand:SWI 1 "nonimmediate_operand" "0")
+ (match_operator:SWI 2 "ix86_carry_flag_unset_operator"
+ [(match_operand 3 "flags_reg_operand") (const_int 0)]))
+ (const_int 0)))
+ (set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m")
+ (minus:SWI
+ (match_dup 1)
+ (match_op_dup 2 [(match_dup 3) (const_int 0)])))]
+ "ix86_match_ccmode (insn, CCGOCmode)
+ && (!MEM_P (operands[0]) || rtx_equal_p (operands[0], operands[1]))"
+ "adc{<imodesuffix>}\t{$-1, %0|%0, -1}"
+ [(set_attr "type" "alu")
+ (set_attr "use_carry" "1")
+ (set_attr "pent_pair" "pu")
+ (set_attr "mode" "<MODE>")])
+
(define_insn "*subqi3_carry_zext<mode>"
[(set (match_operand:SWI248x 0 "register_operand" "=r,r")
(zero_extend:SWI248x
@@ -12213,7 +12351,7 @@
(compare:CCNO
(and:SWI48
(match_operand:SWI48 0 "nonimmediate_operand")
- (match_operand:SWI48 1 "<nonmemory_szext_operand>"))
+ (match_operand:SWI48 1 "<general_szext_operand>"))
(const_int 0)))])
(define_expand "testqi_ccz_1"
@@ -12221,7 +12359,7 @@
(compare:CCZ
(and:QI
(match_operand:QI 0 "nonimmediate_operand")
- (match_operand:QI 1 "nonmemory_operand"))
+ (match_operand:QI 1 "general_operand"))
(const_int 0)))])
(define_insn "*testdi_1"
@@ -12229,7 +12367,7 @@
(compare
(and:DI
(match_operand:DI 0 "nonimmediate_operand" "%r,rm")
- (match_operand:DI 1 "x86_64_szext_nonmemory_operand" "Z,re"))
+ (match_operand:DI 1 "x86_64_szext_general_operand" "Z,re"))
(const_int 0)))]
"TARGET_64BIT
&& ix86_match_ccmode
@@ -12242,7 +12380,8 @@
(satisfies_constraint_Z (operands[1])
&& (!CONST_INT_P (operands[1])
|| val_signbit_known_set_p (SImode, INTVAL (operands[1]))))
- ? CCZmode : CCNOmode)"
+ ? CCZmode : CCNOmode)
+ && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
"@
test{l}\t{%k1, %k0|%k0, %k1}
test{q}\t{%1, %0|%0, %1}"
@@ -12253,12 +12392,13 @@
[(set (reg FLAGS_REG)
(compare
(and:QI
- (match_operand:QI 0 "nonimmediate_operand" "%qm,qm,r")
- (match_operand:QI 1 "nonmemory_operand" "q,n,n"))
+ (match_operand:QI 0 "nonimmediate_operand" "%qm,*a,qm,r")
+ (match_operand:QI 1 "general_operand" "q,n,n,n"))
(const_int 0)))]
"ix86_match_ccmode (insn,
CONST_INT_P (operands[1])
- && INTVAL (operands[1]) >= 0 ? CCNOmode : CCZmode)"
+ && INTVAL (operands[1]) >= 0 ? CCNOmode : CCZmode)
+ && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
{
if (get_attr_mode (insn) == MODE_SI)
{
@@ -12270,7 +12410,7 @@
}
[(set_attr "type" "test")
(set (attr "mode")
- (cond [(eq_attr "alternative" "2")
+ (cond [(eq_attr "alternative" "3")
(const_string "SI")
(and (match_test "optimize_insn_for_size_p ()")
(and (match_operand 0 "ext_QIreg_operand")
@@ -12278,16 +12418,17 @@
(const_string "SI")
]
(const_string "QI")))
- (set_attr "pent_pair" "uv,np,np")])
+ (set_attr "pent_pair" "uv,uv,np,np")])
(define_insn "*test<mode>_1"
[(set (reg FLAGS_REG)
(compare
(and:SWI124
(match_operand:SWI124 0 "nonimmediate_operand" "%<r>m,*a,<r>m")
- (match_operand:SWI124 1 "<nonmemory_szext_operand>" "<r>,<i>,<i>"))
- (const_int 0)))]
- "ix86_match_ccmode (insn, CCNOmode)"
+ (match_operand:SWI124 1 "<general_operand>" "<r>,<i>,<i>"))
+ (const_int 0)))]
+ "ix86_match_ccmode (insn, CCNOmode)
+ && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
"test{<imodesuffix>}\t{%1, %0|%0, %1}"
[(set_attr "type" "test")
(set_attr "mode" "<MODE>")
@@ -14062,6 +14203,22 @@
(set_attr "isa" "*,apx_ndd")
(set_attr "mode" "SI")])
+;; This pattern must come before *<code><mode>_3 below.
+(define_insn "*ior<mode>_ccz_1"
+ [(set (reg:CCZ FLAGS_REG)
+ (compare:CCZ
+ (ior:SWI1248_AVX512BWDQ_64
+ (match_operand:SWI1248_AVX512BWDQ_64 1 "nonimmediate_operand" "%0,?k")
+ (match_operand:SWI1248_AVX512BWDQ_64 2 "<general_operand>" "<g>, k"))
+ (const_int 0)))
+ (clobber (match_scratch:SWI1248_AVX512BWDQ_64 0 "=<r>, X"))]
+ "TARGET_AVX512F && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+ "@
+ or{<imodesuffix>}\t{%2, %0|%0, %2}
+ kortest<mskmodesuffix>\t{%1, %2|%2, %1}"
+ [(set_attr "type" "alu,msklog")
+ (set_attr "mode" "<MODE>")])
+
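A usage sketch for the kortest alternative (assumes AVX-512F mask intrinsics): when both sides of the OR live in mask registers, the zero test needs no transfer to a general-purpose register.

  #include <immintrin.h>

  int
  neither_matches (__m512i a, __m512i b, __m512i key)
  {
    __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, key);
    __mmask16 m2 = _mm512_cmpeq_epi32_mask (b, key);
    return (m1 | m2) == 0;  /* kortestw m1, m2; sete */
  }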
(define_insn "*<code><mode>_3"
[(set (reg FLAGS_REG)
(compare (any_or:SWI
@@ -25708,6 +25865,23 @@
(set_attr "length_immediate" "0")
(set_attr "modrm" "0")])
+(define_expand "movmem<mode>"
+ [(use (match_operand:BLK 0 "memory_operand"))
+ (use (match_operand:BLK 1 "memory_operand"))
+ (use (match_operand:SWI48 2 "nonmemory_operand"))
+ (use (match_operand:SWI48 3 "const_int_operand"))
+ (use (match_operand:SI 4 "const_int_operand"))
+ (use (match_operand:SI 5 "const_int_operand"))
+ (use (match_operand:SI 6 ""))
+ (use (match_operand:SI 7 ""))
+ (use (match_operand:SI 8 ""))]
+ ""
+{
+ if (ix86_expand_movmem (operands))
+ DONE;
+ FAIL;
+})
+
(define_expand "cpymem<mode>"
[(use (match_operand:BLK 0 "memory_operand"))
(use (match_operand:BLK 1 "memory_operand"))
@@ -27353,6 +27527,72 @@
(match_dup 0))]
"peep2_reg_dead_p (2, operands[0])"
[(set (match_dup 2) (match_dup 1))])
+
+;; umax (a, add (a, b)) => [sum, ovf] = add (a, b); ovf ? a : sum
+;; umin (a, add (a, b)) => [sum, ovf] = add (a, b); ovf ? sum : a
+
+(define_code_attr ovf_add_cmp [(umax "geu") (umin "ltu")])
+
+(define_int_iterator ovf_comm [1 2])
+
+(define_insn_and_split "*plus_within_<code><mode>3_<ovf_comm>"
+ [(set (match_operand:SWI248 0 "register_operand")
+ (umaxmin:SWI248
+ (plus:SWI248 (match_operand:SWI248 1 "nonimmediate_operand")
+ (match_operand:SWI248 2 "<general_operand>"))
+ (match_dup ovf_comm)))
+ (clobber (reg:CC FLAGS_REG))]
+ "TARGET_CMOVE
+ && ix86_pre_reload_split ()"
+ "#"
+ "&& 1"
+ [(parallel
+ [(set (reg:CCC FLAGS_REG)
+ (compare:CCC
+ (plus:SWI248 (match_dup 1) (match_dup 2))
+ (match_dup ovf_comm)))
+ (set (match_dup 3)
+ (plus:SWI248 (match_dup 1) (match_dup 2)))])
+ (set (match_dup 0)
+ (if_then_else:SWI248
+ (<ovf_add_cmp> (reg:CCC FLAGS_REG) (const_int 0))
+ (match_dup 3)
+ (match_dup ovf_comm)))]
+{
+ operands[<ovf_comm>] = force_reg (<MODE>mode, operands[<ovf_comm>]);
+ operands[3] = gen_reg_rtx (<MODE>mode);
+})
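In C terms the split relies on unsigned wraparound: the ADD's carry flag is set exactly when the wrapped sum dropped below the original operand, so no separate compare is needed before the CMOV. A minimal sketch of the equivalence (hypothetical helper, assuming 32-bit unsigned arithmetic):

#include <stdint.h>

static inline uint32_t
umax_plus (uint32_t a, uint32_t b)
{
  uint32_t sum = a + b;     /* one ADD; CF set on unsigned overflow */
  return sum < a ? a : sum; /* umax (a, a + b), folded to CMOVcc on CF */
}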
+
+;; umax (a, sub (a, b)) => [diff, udf] = sub (a, b); udf ? diff : a
+;; umin (a, sub (a, b)) => [diff, udf] = sub (a, b); udf ? a : diff
+
+(define_code_attr udf_sub_cmp [(umax "ltu") (umin "geu")])
+
+(define_insn_and_split "*minus_within_<code><mode>3"
+ [(set (match_operand:SWI248 0 "register_operand")
+ (umaxmin:SWI248
+ (minus:SWI248 (match_operand:SWI248 1 "nonimmediate_operand")
+ (match_operand:SWI248 2 "<general_operand>"))
+ (match_dup 1)))
+ (clobber (reg:CC FLAGS_REG))]
+ "TARGET_CMOVE
+ && ix86_pre_reload_split ()"
+ "#"
+ "&& 1"
+ [(parallel
+ [(set (reg:CC FLAGS_REG)
+ (compare:CC (match_dup 1) (match_dup 2)))
+ (set (match_dup 3)
+ (minus:SWI248 (match_dup 1) (match_dup 2)))])
+ (set (match_dup 0)
+ (if_then_else:SWI248
+ (<udf_sub_cmp> (reg:CC FLAGS_REG) (const_int 0))
+ (match_dup 3)
+ (match_dup 1)))]
+{
+ operands[1] = force_reg (<MODE>mode, operands[1]);
+ operands[3] = gen_reg_rtx (<MODE>mode);
+})
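The subtraction variant mirrors this with the borrow flag: a - b wraps to a large value exactly when b > a, so the SUB's carry flag again decides the select. A sketch under the same assumptions (hypothetical helper):

#include <stdint.h>

static inline uint32_t
umax_minus (uint32_t a, uint32_t b)
{
  uint32_t diff = a - b;   /* one SUB; CF set on borrow (b > a) */
  return a < b ? diff : a; /* wrapped diff is the unsigned maximum */
}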
;; Misc patterns (?)
@@ -29471,7 +29711,7 @@
[(match_operand:SI 0 "register_operand")
(match_operand:SI 1 "register_operand")
(match_operand:SWI124 2 "nonimmediate_operand")
- (match_operand:SI 3)]
+ (match_operand:SI 3 "const_int_operand")]
"TARGET_CRC32"
{
/* crc32 uses iSCSI polynomial */
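For context on that comment: the SSE4.2 crc32 instruction computes CRC-32C with the Castagnoli polynomial 0x1EDC6F41, the one iSCSI specifies, rather than zlib's CRC-32 polynomial 0x04C11DB7. A minimal usage sketch (hypothetical function; compile with -msse4.2):

#include <immintrin.h>

unsigned int
crc32c_step (unsigned int crc, unsigned char byte)
{
  return _mm_crc32_u8 (crc, byte); /* one byte of a CRC-32C update */
}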
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index 8449450..c0093ef 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -1290,11 +1290,11 @@ Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX and
SM4 built-in functions and code generation.
mgather
-Target Alias(mtune-ctrl=, use_gather, ^use_gather)
+Target Var(ix86_use_gather) Init(0) Optimization
Enable vectorization for gather instruction.
mscatter
-Target Alias(mtune-ctrl=, use_scatter, ^use_scatter)
+Target Var(ix86_use_scatter) Init(0) Optimization
Enable vectorization for scatter instruction.
mapxf
diff --git a/gcc/config/i386/i386.opt.urls b/gcc/config/i386/i386.opt.urls
index a9bbac0..129d91f 100644
--- a/gcc/config/i386/i386.opt.urls
+++ b/gcc/config/i386/i386.opt.urls
@@ -13,10 +13,10 @@ mlong-double-80
UrlSuffix(gcc/x86-Options.html#index-mlong-double-80)
mlong-double-64
-UrlSuffix(gcc/x86-Options.html#index-mlong-double-64-1)
+UrlSuffix(gcc/x86-Options.html#index-mlong-double-64-2)
mlong-double-128
-UrlSuffix(gcc/x86-Options.html#index-mlong-double-128-1)
+UrlSuffix(gcc/x86-Options.html#index-mlong-double-128-2)
maccumulate-outgoing-args
UrlSuffix(gcc/x86-Options.html#index-maccumulate-outgoing-args-1)
@@ -57,7 +57,7 @@ UrlSuffix(gcc/x86-Options.html#index-mfp-ret-in-387)
; duplicate: 'gcc/x86-Options.html#index-mfpmath-1'
mhard-float
-UrlSuffix(gcc/x86-Options.html#index-mhard-float-11)
+UrlSuffix(gcc/x86-Options.html#index-mhard-float-10)
mieee-fp
UrlSuffix(gcc/x86-Options.html#index-mieee-fp)
@@ -120,7 +120,7 @@ mrtd
UrlSuffix(gcc/x86-Options.html#index-mrtd-1)
msoft-float
-UrlSuffix(gcc/x86-Options.html#index-msoft-float-16)
+UrlSuffix(gcc/x86-Options.html#index-msoft-float-15)
msseregparm
UrlSuffix(gcc/x86-Options.html#index-msseregparm)
@@ -438,7 +438,7 @@ mpku
UrlSuffix(gcc/x86-Options.html#index-mpku)
mstack-protector-guard=
-UrlSuffix(gcc/x86-Options.html#index-mstack-protector-guard-4)
+UrlSuffix(gcc/x86-Options.html#index-mstack-protector-guard-5)
mstack-protector-guard-reg=
UrlSuffix(gcc/x86-Options.html#index-mstack-protector-guard-reg-3)
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 57950d3..2863b3e 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1319,9 +1319,6 @@
(ior (match_operand 0 "nonimmediate_operand")
(match_test "const_vec_duplicate_p (op)")))
-(define_predicate "const_vec_dup_operand"
- (match_test "const_vec_duplicate_p (op)"))
-
;; Return true when OP is either register operand, or any
;; CONST_VECTOR.
(define_predicate "reg_or_const_vector_operand"
@@ -1587,6 +1584,9 @@
(define_predicate "add_comparison_operator"
(match_code "geu,ltu"))
+(define_predicate "ieee_maxmin_comparison_operator"
+ (match_code "lt,gt"))
+
;; Return true if OP is a valid comparison operator in valid mode.
(define_predicate "ix86_comparison_operator"
(match_operand 0 "comparison_operator")
diff --git a/gcc/config/i386/sol2.h b/gcc/config/i386/sol2.h
index 013e87f..3720424 100644
--- a/gcc/config/i386/sol2.h
+++ b/gcc/config/i386/sol2.h
@@ -60,7 +60,7 @@ along with GCC; see the file COPYING3. If not see
/* GNU as understands --32 and --64, but the native Solaris
assembler requires -xarch=generic or -xarch=generic64 instead. */
-#ifdef USE_GAS
+#if HAVE_GNU_AS
#define ASM_CPU32_DEFAULT_SPEC "--32"
#define ASM_CPU64_DEFAULT_SPEC "--64"
#else
@@ -90,16 +90,9 @@ along with GCC; see the file COPYING3. If not see
#define ARCH64_SUBDIR "amd64"
-#ifdef USE_GLD
-/* Since binutils 2.21, GNU ld supports new *_sol2 emulations to strictly
- follow the Solaris 2 ABI. Prefer them if present. */
-#ifdef HAVE_LD_SOL2_EMULATION
+#if HAVE_GNU_LD
#define ARCH32_EMULATION "elf_i386_sol2"
#define ARCH64_EMULATION "elf_x86_64_sol2"
-#else
-#define ARCH32_EMULATION "elf_i386"
-#define ARCH64_EMULATION "elf_x86_64"
-#endif
#endif
#define ENDFILE_ARCH_SPEC \
@@ -156,7 +149,7 @@ along with GCC; see the file COPYING3. If not see
} \
} while (0)
-#ifndef USE_GAS
+#if !HAVE_GNU_AS
/* The Sun assembler uses .tcomm for TLS common sections. */
#define TLS_COMMON_ASM_OP ".tcomm"
@@ -186,7 +179,7 @@ along with GCC; see the file COPYING3. If not see
ASM_OUTPUT_LABEL (FILE, NAME); \
} \
while (0)
-#endif /* !USE_GAS */
+#endif /* !HAVE_GNU_AS */
/* As in sparc/sol2.h, override the default from i386/x86-64.h to work
around Sun as TLS bug. */
@@ -217,13 +210,13 @@ along with GCC; see the file COPYING3. If not see
/* Sun as requires "h" flag for large sections, GNU as can do without, but
accepts "l". */
-#ifdef USE_GAS
+#if HAVE_GNU_AS
#define MACH_DEP_SECTION_ASM_FLAG 'l'
#else
#define MACH_DEP_SECTION_ASM_FLAG 'h'
#endif
-#ifndef USE_GAS
+#if !HAVE_GNU_AS
/* Emit COMDAT group signature symbols for Sun as. */
#undef TARGET_ASM_FILE_END
#define TARGET_ASM_FILE_END solaris_file_end
@@ -231,12 +224,12 @@ along with GCC; see the file COPYING3. If not see
/* Unlike GNU ld, Sun ld doesn't coalesce .ctors.N/.dtors.N sections, so
inhibit their creation. Also cf. sparc/sysv4.h. */
-#ifndef USE_GLD
+#if !HAVE_GNU_LD
#define CTORS_SECTION_ASM_OP "\t.section\t.ctors, \"aw\""
#define DTORS_SECTION_ASM_OP "\t.section\t.dtors, \"aw\""
#endif
-#ifndef USE_GAS
+#if !HAVE_GNU_AS
#define LARGECOMM_SECTION_ASM_OP "\t.lbcomm\t"
#endif
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 5eba992..fb79b2e 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -643,6 +643,9 @@
(define_mode_iterator VI2_AVX512F
[(V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX2") V8HI])
+(define_mode_iterator VI2_AVX10_2
+ [(V32HI "TARGET_AVX10_2") (V16HI "TARGET_AVX2") V8HI])
+
(define_mode_iterator VI2_AVX512VNNIBW
[(V32HI "TARGET_AVX512BW || TARGET_AVX512VNNI")
(V16HI "TARGET_AVX2") V8HI])
@@ -3335,10 +3338,10 @@
[(set (match_operand:VFH 0 "register_operand")
(vec_merge:VFH
(match_operand:VFH 1 "nonimmediate_operand")
- (match_operand:VFH 2 "nonimmediate_operand")
+ (match_operand:VFH 2 "general_operand")
(unspec:<avx512fmaskmode>
[(match_operand:VFH 3 "nonimmediate_operand")
- (match_operand:VFH 4 "nonimmediate_operand")
+ (match_operand:VFH 4 "general_operand")
(match_operand:SI 5 "const_0_to_31_operand")]
UNSPEC_PCMP)))]
"TARGET_SSE && ix86_pre_reload_split ()
@@ -3349,19 +3352,21 @@
&& (INTVAL (operands[5]) == 1 || INTVAL (operands[5]) == 14)"
"#"
"&& 1"
- [(const_int 0)]
+ [(set (match_dup 0) (match_dup 6))]
{
int u = UNSPEC_IEEE_MIN;
+ rtx tmp = operands[2];
if ((INTVAL (operands[5]) == 1 && rtx_equal_p (operands[1], operands[4]))
|| (INTVAL (operands[5]) == 14 && rtx_equal_p (operands[1], operands[3])))
u = UNSPEC_IEEE_MAX;
if (MEM_P (operands[1]))
operands[1] = force_reg (<MODE>mode, operands[1]);
- rtvec v = gen_rtvec (2, operands[1], operands[2]);
- rtx tmp = gen_rtx_UNSPEC (<MODE>mode, v, u);
- emit_move_insn (operands[0], tmp);
- DONE;
+
+ if (immediate_operand (operands[2], <MODE>mode))
+ tmp = force_reg (<MODE>mode, operands[2]);
+ rtvec v = gen_rtvec (2, operands[1], tmp);
+ operands[6] = gen_rtx_UNSPEC (<MODE>mode, v, u);
})
(define_insn_and_split "*minmax<mode>3_2"
@@ -3380,7 +3385,7 @@
&& rtx_equal_p (operands[2], operands[3])))"
"#"
"&& 1"
- [(const_int 0)]
+ [(set (match_dup 0) (match_dup 5))]
{
int u = UNSPEC_IEEE_MIN;
if (rtx_equal_p (operands[1], operands[3]))
@@ -3389,9 +3394,53 @@
if (MEM_P (operands[2]))
operands[2] = force_reg (<MODE>mode, operands[2]);
rtvec v = gen_rtvec (2, operands[2], operands[1]);
- rtx tmp = gen_rtx_UNSPEC (<MODE>mode, v, u);
- emit_move_insn (operands[0], tmp);
- DONE;
+ operands[5] = gen_rtx_UNSPEC (<MODE>mode, v, u);
+ })
+
+(define_insn_and_split "*minmax<mode>3_3"
+ [(set (match_operand:VF_128_256 0 "register_operand")
+ (and:VF_128_256
+ (not:VF_128_256
+ (match_operator:VF_128_256 1 "ieee_maxmin_comparison_operator"
+ [(match_operand:VF_128_256 2 "nonimmediate_operand")
+ (match_operand:VF_128_256 3 "const0_operand")]))
+ (match_operand:VF_128_256 4 "nonimmediate_operand")))]
+ "TARGET_SSE && ix86_pre_reload_split ()
+ && rtx_equal_p (operands[2], operands[4])"
+ "#"
+ "&& 1"
+ [(set (match_dup 0) (match_dup 5))]
+ {
+ int u = UNSPEC_IEEE_MIN;
+ if (GET_CODE (operands[1]) == LT)
+ u = UNSPEC_IEEE_MAX;
+
+ rtx tmp = force_reg (<MODE>mode, operands[3]);
+ rtvec v = gen_rtvec (2, tmp, operands[2]);
+ operands[5] = gen_rtx_UNSPEC (<MODE>mode, v, u);
+ })
+
+(define_insn_and_split "*minmax<mode>3_4"
+ [(set (match_operand:VF_128_256 0 "register_operand")
+ (and:VF_128_256
+ (match_operator:VF_128_256 1 "ieee_maxmin_comparison_operator"
+ [(match_operand:VF_128_256 2 "nonimmediate_operand")
+ (match_operand:VF_128_256 3 "const0_operand")])
+ (match_operand:VF_128_256 4 "nonimmediate_operand")))]
+ "TARGET_SSE && ix86_pre_reload_split ()
+ && rtx_equal_p (operands[2], operands[4])"
+ "#"
+ "&& 1"
+ [(set (match_dup 0) (match_dup 5))]
+ {
+ int u = UNSPEC_IEEE_MIN;
+ if (GET_CODE (operands[1]) == GT)
+ u = UNSPEC_IEEE_MAX;
+
+ rtx tmp = force_reg (<MODE>mode, operands[3]);
+ rtvec v = gen_rtvec (2, operands[2], tmp);
+ operands[5] = gen_rtx_UNSPEC (<MODE>mode, v, u);
})
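These two splitters recognize the masked clamp-against-zero shape the vectorizer emits, roughly ~(x < 0) & x and (x > 0) & x, and rewrite it as a single IEEE max/min against a zeroed register. A hypothetical scalar loop whose vectorized form takes this shape:

void
relu (float *restrict out, const float *restrict in, int n)
{
  for (int i = 0; i < n; i++)
    /* Per lane: andn (cmplt (x, 0), x); with the splitter this can
       become one maxps against zero.  */
    out[i] = in[i] < 0.0f ? 0.0f : in[i];
}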
;; These versions of the min/max patterns implement exactly the operations
@@ -4650,6 +4699,9 @@
UNSPEC_PCMP))]
"operands[4] = GEN_INT (INTVAL (operands[3]) ^ 4);")
+(define_int_iterator UNSPEC_PCMP_ITER
+ [UNSPEC_PCMP UNSPEC_UNSIGNED_PCMP])
+
(define_insn "*<avx512>_cmp<mode>3_and15"
[(set (match_operand:QI 0 "register_operand" "=k")
(and:QI
@@ -4682,6 +4734,23 @@
(set_attr "prefix" "evex")
(set_attr "mode" "<sseinsnmode>")])
+(define_insn "*<avx512>_eq<mode>3_and15"
+ [(set (match_operand:QI 0 "register_operand" "=k, k")
+ (and:QI
+ (unspec:QI
+ [(match_operand:VI48_AVX512VL_4 1 "nonimm_or_0_operand" "%v, v")
+ (match_operand:VI48_AVX512VL_4 2 "nonimm_or_0_operand" "vm, C")
+ (const_int 0)]
+ UNSPEC_PCMP_ITER)
+ (const_int 15)))]
+ "TARGET_AVX512F && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+ "@
+ vpcmpeq<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}
+ vptestnm<ssemodesuffix>\t{%1, %1, %0|%0, %1, %1}"
+ [(set_attr "type" "ssecmp")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<sseinsnmode>")])
+
(define_insn "*<avx512>_cmp<mode>3_and3"
[(set (match_operand:QI 0 "register_operand" "=k")
(and:QI
@@ -4714,6 +4783,23 @@
(set_attr "prefix" "evex")
(set_attr "mode" "TI")])
+(define_insn "*avx512vl_eqv2di_and3"
+ [(set (match_operand:QI 0 "register_operand" "=k, k")
+ (and:QI
+ (unspec:QI
+ [(match_operand:V2DI 1 "nonimm_or_0_operand" "%v, v")
+ (match_operand:V2DI 2 "nonimm_or_0_operand" "vm, C")
+ (const_int 0)]
+ UNSPEC_PCMP_ITER)
+ (const_int 3)))]
+ "TARGET_AVX512F && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+ "@
+ vpcmpeqq\t{%2, %1, %0|%0, %1, %2}
+ vptestnmq\t{%1, %1, %0|%0, %1, %1}"
+ [(set_attr "type" "ssecmp")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "TI")])
+
(define_insn "<avx512>_cmp<mode>3<mask_scalar_merge_name>"
[(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
(unspec:<avx512fmaskmode>
@@ -4787,9 +4873,6 @@
(set_attr "prefix" "evex")
(set_attr "mode" "<VI12_AVX512VL:sseinsnmode>")])
-(define_int_iterator UNSPEC_PCMP_ITER
- [UNSPEC_PCMP UNSPEC_UNSIGNED_PCMP])
-
(define_insn_and_split "*<avx512>_cmp<mode>3"
[(set (match_operand:<avx512fmaskmode> 0 "register_operand")
(not:<avx512fmaskmode>
@@ -4817,7 +4900,8 @@
(match_operand:SI 3 "<cmp_imm_predicate>")]
UNSPEC_PCMP_ITER))]
"TARGET_AVX512F && ix86_pre_reload_split ()
- && rtx_equal_p (operands[1], operands[2])"
+ && rtx_equal_p (operands[1], operands[2])
+ && (!MEM_P (operands[1]) || !MEM_VOLATILE_P (operands[1]))"
"#"
"&& 1"
[(set (match_dup 0) (match_dup 4))]
@@ -27256,24 +27340,6 @@
DONE;
})
-(define_expand "cond_<insn><mode>"
- [(set (match_operand:VI1_AVX512VL 0 "register_operand")
- (vec_merge:VI1_AVX512VL
- (any_shift:VI1_AVX512VL
- (match_operand:VI1_AVX512VL 2 "register_operand")
- (match_operand:VI1_AVX512VL 3 "const_vec_dup_operand"))
- (match_operand:VI1_AVX512VL 4 "nonimm_or_0_operand")
- (match_operand:<avx512fmaskmode> 1 "register_operand")))]
- "TARGET_GFNI && TARGET_AVX512F"
-{
- rtx count = XVECEXP (operands[3], 0, 0);
- rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], count, <CODE>);
- emit_insn (gen_vgf2p8affineqb_<mode>_mask (operands[0], operands[2], matrix,
- const0_rtx, operands[4],
- operands[1]));
- DONE;
-})
-
(define_expand "<insn><mode>3"
[(set (match_operand:VI1_AVX512_3264 0 "register_operand")
(any_rotate:VI1_AVX512_3264
@@ -32334,8 +32400,8 @@
(define_expand "usdot_prod<sseunpackmodelower><mode>"
[(match_operand:<sseunpackmode> 0 "register_operand")
- (match_operand:VI2_AVX512F 1 "register_operand")
- (match_operand:VI2_AVX512F 2 "register_operand")
+ (match_operand:VI2_AVX10_2 1 "register_operand")
+ (match_operand:VI2_AVX10_2 2 "register_operand")
(match_operand:<sseunpackmode> 3 "register_operand")]
"TARGET_AVXVNNIINT16 || TARGET_AVX10_2"
{
@@ -32352,8 +32418,8 @@
(define_expand "udot_prod<sseunpackmodelower><mode>"
[(match_operand:<sseunpackmode> 0 "register_operand")
- (match_operand:VI2_AVX512F 1 "register_operand")
- (match_operand:VI2_AVX512F 2 "register_operand")
+ (match_operand:VI2_AVX10_2 1 "register_operand")
+ (match_operand:VI2_AVX10_2 2 "register_operand")
(match_operand:<sseunpackmode> 3 "register_operand")]
"TARGET_AVXVNNIINT16 || TARGET_AVX10_2"
{
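Both expanders cover 16-bit dot products (mixed-sign for usdot_prod, unsigned for udot_prod); the iterator change gates the 512-bit modes on AVX10.2 instead of AVX512F. A hedged sketch of a loop usdot_prod vectorizes (hypothetical function; maps to vpdpwusd under AVXVNNIINT16 or AVX10.2):

int
usdot (const unsigned short *a, const short *b, int n)
{
  int sum = 0;
  for (int i = 0; i < n; i++)
    sum += a[i] * b[i]; /* unsigned x signed words, 32-bit accumulate */
  return sum;
}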
diff --git a/gcc/config/i386/x-mingw32 b/gcc/config/i386/x-mingw32
index 8900bfc..5ebe088 100644
--- a/gcc/config/i386/x-mingw32
+++ b/gcc/config/i386/x-mingw32
@@ -21,6 +21,9 @@
#
local_includedir=$(libsubdir)/$(unlibsubdir)/..`echo $(exec_prefix) | sed -e 's|^$(prefix)||' -e 's|/[^/]*|/..|g'`/include
+# Add Windows socket library.
+LIBS += -lws2_32
+
# On MinGW, we use "%IA64d" to print 64-bit integers, and the format-checking
# code does not handle that, so we have to disable checking here.
WERROR_FLAGS += -Wno-format
diff --git a/gcc/config/i386/x86-tune-sched.cc b/gcc/config/i386/x86-tune-sched.cc
index ff9c268..11b3338 100644
--- a/gcc/config/i386/x86-tune-sched.cc
+++ b/gcc/config/i386/x86-tune-sched.cc
@@ -110,6 +110,9 @@ ix86_issue_rate (void)
case PROCESSOR_PANTHERLAKE:
return 6;
+ case PROCESSOR_NOVALAKE:
+ return 8;
+
default:
return 1;
}
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 3627312..dcd26d5 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -602,7 +602,7 @@ DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER2
/* X86_TUNE_AVX256_OPTIMAL: Use 256-bit AVX instructions instead of 512-bit AVX
instructions in the auto-vectorizer. */
-DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_CORE_AVX512)
+DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_CORE_AVX512 | m_NOVALAKE)
/* X86_TUNE_AVX256_AVOID_VEC_PERM: Avoid using 256-bit cross-lane
vector permutation instructions in the auto-vectorizer. */