path: root/gcc/config
author     Ian Lance Taylor <iant@golang.org>    2021-10-07 15:28:36 -0700
committer  Ian Lance Taylor <iant@golang.org>    2021-10-07 15:28:36 -0700
commit     0b6b70a0733672600644c8df96942cda5bf86d3d (patch)
tree       9a1fbd7f782c54df55ab225ed1be057e3f3b0b8a /gcc/config
parent     a5b5cabc91c38710adbe5c8a2b53882abe994441 (diff)
parent     fba228e259dd5112851527f2dbb62c5601100985 (diff)
Merge from trunk revision fba228e259dd5112851527f2dbb62c5601100985.
Diffstat (limited to 'gcc/config')
-rw-r--r--  gcc/config/aarch64/aarch64-arches.def            |    2
-rw-r--r--  gcc/config/aarch64/aarch64-builtins.c            |   12
-rw-r--r--  gcc/config/aarch64/aarch64-cores.def             |    9
-rw-r--r--  gcc/config/aarch64/aarch64-option-extensions.def |    3
-rw-r--r--  gcc/config/aarch64/aarch64-simd-builtins.def     |    4
-rw-r--r--  gcc/config/aarch64/aarch64-tune.md               |    2
-rw-r--r--  gcc/config/aarch64/aarch64.c                     |   67
-rw-r--r--  gcc/config/aarch64/aarch64.h                     |   15
-rw-r--r--  gcc/config/aarch64/arm_neon.h                    |   32
-rw-r--r--  gcc/config/arm/arm-cpus.in                       |   10
-rw-r--r--  gcc/config/arm/arm-tables.opt                    |    3
-rw-r--r--  gcc/config/arm/arm-tune.md                       |    2
-rw-r--r--  gcc/config/darwin.h                              |    2
-rw-r--r--  gcc/config/gcn/gcn-hsa.h                         |   65
-rw-r--r--  gcc/config/gcn/gcn-valu.md                       |   16
-rw-r--r--  gcc/config/gcn/gcn.c                             |   50
-rw-r--r--  gcc/config/gcn/mkoffload.c                       |  112
-rw-r--r--  gcc/config/i386/avx512fp16intrin.h               | 1127
-rw-r--r--  gcc/config/i386/avx512fp16vlintrin.h             |  500
-rw-r--r--  gcc/config/i386/i386-builtin-types.def           |    5
-rw-r--r--  gcc/config/i386/i386-builtin.def                 |   40
-rw-r--r--  gcc/config/i386/i386-expand.c                    |   10
-rw-r--r--  gcc/config/i386/i386-features.c                  |   15
-rw-r--r--  gcc/config/i386/i386.c                           |   28
-rw-r--r--  gcc/config/i386/i386.h                           |   13
-rw-r--r--  gcc/config/i386/i386.md                          |  151
-rw-r--r--  gcc/config/i386/mmx.md                           |   72
-rw-r--r--  gcc/config/i386/sse.md                           |  608
-rw-r--r--  gcc/config/i386/subst.md                         |  102
-rw-r--r--  gcc/config/lm32/uclinux-elf.h                    |    1
-rw-r--r--  gcc/config/pa/pa.c                               |   10
-rw-r--r--  gcc/config/pru/constraints.md                    |    5
-rw-r--r--  gcc/config/pru/predicates.md                     |   19
-rw-r--r--  gcc/config/pru/pru-pragma.c                      |    2
-rw-r--r--  gcc/config/pru/pru-protos.h                      |    3
-rw-r--r--  gcc/config/pru/pru.c                             |  156
-rw-r--r--  gcc/config/pru/pru.h                             |    5
-rw-r--r--  gcc/config/pru/pru.md                            |  102
-rw-r--r--  gcc/config/riscv/riscv.md                        |   10
-rw-r--r--  gcc/config/rs6000/darwin.h                       |    5
-rw-r--r--  gcc/config/rs6000/rs6000-call.c                  |   64
-rw-r--r--  gcc/config/rs6000/rs6000.c                       |   22
-rw-r--r--  gcc/config/rs6000/rs6000.opt                     |   38
-rw-r--r--  gcc/config/rs6000/vxworks.h                      |    4
-rw-r--r--  gcc/config/s390/s390.c                           |   10
-rw-r--r--  gcc/config/s390/tpf.md                           |    6
46 files changed, 3225 insertions(+), 314 deletions(-)
diff --git a/gcc/config/aarch64/aarch64-arches.def b/gcc/config/aarch64/aarch64-arches.def
index b749727..a3b32e0 100644
--- a/gcc/config/aarch64/aarch64-arches.def
+++ b/gcc/config/aarch64/aarch64-arches.def
@@ -37,6 +37,8 @@ AARCH64_ARCH("armv8.3-a", generic, 8_3A, 8, AARCH64_FL_FOR_ARCH8_3)
AARCH64_ARCH("armv8.4-a", generic, 8_4A, 8, AARCH64_FL_FOR_ARCH8_4)
AARCH64_ARCH("armv8.5-a", generic, 8_5A, 8, AARCH64_FL_FOR_ARCH8_5)
AARCH64_ARCH("armv8.6-a", generic, 8_6A, 8, AARCH64_FL_FOR_ARCH8_6)
+AARCH64_ARCH("armv8.7-a", generic, 8_7A, 8, AARCH64_FL_FOR_ARCH8_7)
AARCH64_ARCH("armv8-r", generic, 8R , 8, AARCH64_FL_FOR_ARCH8_R)
+AARCH64_ARCH("armv9-a", generic, 9A , 9, AARCH64_FL_FOR_ARCH9)
#undef AARCH64_ARCH
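These .def files are X-macro tables: each consumer defines AARCH64_ARCH to pick out the fields it needs, includes the file, and undefines the macro again. A minimal, self-contained sketch of the pattern (the struct and the trimmed entries below are illustrative, not GCC's actual definitions):

    #include <stdio.h>

    struct arch_info { const char *name; int version; };

    /* Consumer's view: keep only the name and architecture version.  */
    #define AARCH64_ARCH(NAME, CORE, IDENT, REV, FLAGS) { NAME, REV },
    static const struct arch_info all_architectures[] = {
      AARCH64_ARCH ("armv8.7-a", generic, 8_7A, 8, 0)
      AARCH64_ARCH ("armv9-a", generic, 9A, 9, 0)
    };
    #undef AARCH64_ARCH

    int main (void)
    {
      for (unsigned i = 0;
           i < sizeof all_architectures / sizeof *all_architectures; i++)
        printf ("%s (v%d)\n", all_architectures[i].name,
                all_architectures[i].version);
      return 0;
    }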
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 119f67d..1a507ea 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -182,6 +182,10 @@ static enum aarch64_type_qualifiers
aarch64_types_binopp_qualifiers[SIMD_MAX_BUILTIN_ARGS]
= { qualifier_poly, qualifier_poly, qualifier_poly };
#define TYPES_BINOPP (aarch64_types_binopp_qualifiers)
+static enum aarch64_type_qualifiers
+aarch64_types_binop_ppu_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+ = { qualifier_poly, qualifier_poly, qualifier_unsigned };
+#define TYPES_BINOP_PPU (aarch64_types_binop_ppu_qualifiers)
static enum aarch64_type_qualifiers
aarch64_types_ternop_qualifiers[SIMD_MAX_BUILTIN_ARGS]
@@ -207,6 +211,10 @@ aarch64_types_ternopu_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS]
qualifier_unsigned, qualifier_immediate };
#define TYPES_TERNOPUI (aarch64_types_ternopu_imm_qualifiers)
static enum aarch64_type_qualifiers
+aarch64_types_ternop_sssu_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+ = { qualifier_none, qualifier_none, qualifier_none, qualifier_unsigned };
+#define TYPES_TERNOP_SSSU (aarch64_types_ternop_sssu_qualifiers)
+static enum aarch64_type_qualifiers
aarch64_types_ternop_ssus_qualifiers[SIMD_MAX_BUILTIN_ARGS]
= { qualifier_none, qualifier_none, qualifier_unsigned, qualifier_none };
#define TYPES_TERNOP_SSUS (aarch64_types_ternop_ssus_qualifiers)
@@ -214,6 +222,10 @@ static enum aarch64_type_qualifiers
aarch64_types_ternop_suss_qualifiers[SIMD_MAX_BUILTIN_ARGS]
= { qualifier_none, qualifier_unsigned, qualifier_none, qualifier_none };
#define TYPES_TERNOP_SUSS (aarch64_types_ternop_suss_qualifiers)
+static enum aarch64_type_qualifiers
+aarch64_types_ternop_pppu_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+ = { qualifier_poly, qualifier_poly, qualifier_poly, qualifier_unsigned };
+#define TYPES_TERNOP_PPPU (aarch64_types_ternop_pppu_qualifiers)
static enum aarch64_type_qualifiers
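Each qualifier array spells out one builtin signature, return type first, one entry per position; TYPES_BINOP_PPU, for example, reads as "poly result, poly first operand, unsigned second operand", which is exactly the vqtbl1_p8 shape used in arm_neon.h below. A small sketch of the convention (the enum and names are illustrative stand-ins, not GCC's):

    /* Illustrative stand-in for aarch64_type_qualifiers.  */
    enum qualifier { QUAL_NONE, QUAL_UNSIGNED, QUAL_POLY };

    /* { return, arg1, arg2 }: poly8x8_t vqtbl1_p8 (poly8x16_t, uint8x8_t).  */
    static const enum qualifier binop_ppu[] =
      { QUAL_POLY, QUAL_POLY, QUAL_UNSIGNED };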
diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def
index b2aa167..77da310 100644
--- a/gcc/config/aarch64/aarch64-cores.def
+++ b/gcc/config/aarch64/aarch64-cores.def
@@ -162,4 +162,13 @@ AARCH64_CORE("cortex-a76.cortex-a55", cortexa76cortexa55, cortexa53, 8_2A, AAR
/* Armv8-R Architecture Processors. */
AARCH64_CORE("cortex-r82", cortexr82, cortexa53, 8R, AARCH64_FL_FOR_ARCH8_R, cortexa53, 0x41, 0xd15, -1)
+/* Armv9.0-A Architecture Processors. */
+
+/* Arm ('A') cores. */
+AARCH64_CORE("cortex-a510", cortexa510, cortexa55, 9A, AARCH64_FL_FOR_ARCH9 | AARCH64_FL_SVE2_BITPERM | AARCH64_FL_MEMTAG | AARCH64_FL_I8MM | AARCH64_FL_BF16, cortexa53, 0x41, 0xd46, -1)
+
+AARCH64_CORE("cortex-a710", cortexa710, cortexa57, 9A, AARCH64_FL_FOR_ARCH9 | AARCH64_FL_SVE2_BITPERM | AARCH64_FL_MEMTAG | AARCH64_FL_I8MM | AARCH64_FL_BF16, neoversen2, 0x41, 0xd47, -1)
+
+AARCH64_CORE("cortex-x2", cortexx2, cortexa57, 9A, AARCH64_FL_FOR_ARCH9 | AARCH64_FL_SVE2_BITPERM | AARCH64_FL_MEMTAG | AARCH64_FL_I8MM | AARCH64_FL_BF16, neoversen2, 0x41, 0xd48, -1)
+
#undef AARCH64_CORE
diff --git a/gcc/config/aarch64/aarch64-option-extensions.def b/gcc/config/aarch64/aarch64-option-extensions.def
index 579328c..b61f1df 100644
--- a/gcc/config/aarch64/aarch64-option-extensions.def
+++ b/gcc/config/aarch64/aarch64-option-extensions.def
@@ -232,4 +232,7 @@ AARCH64_OPT_EXTENSION("flagm", AARCH64_FL_FLAGM, 0, 0, false, "flagm")
/* Enabling/Disabling "pauth" only changes "pauth". */
AARCH64_OPT_EXTENSION("pauth", AARCH64_FL_PAUTH, 0, 0, false, "paca pacg")
+/* Enabling/Disabling "ls64" only changes "ls64". */
+AARCH64_OPT_EXTENSION("ls64", AARCH64_FL_LS64, 0, 0, false, "")
+
#undef AARCH64_OPT_EXTENSION
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 402453a..35dc075 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -721,6 +721,8 @@
/* Implemented by aarch64_qtbl1<mode>. */
VAR2 (BINOP, qtbl1, 0, NONE, v8qi, v16qi)
VAR2 (BINOPU, qtbl1, 0, NONE, v8qi, v16qi)
+ VAR2 (BINOP_PPU, qtbl1, 0, NONE, v8qi, v16qi)
+ VAR2 (BINOP_SSU, qtbl1, 0, NONE, v8qi, v16qi)
/* Implemented by aarch64_qtbl2<mode>. */
VAR2 (BINOP, qtbl2, 0, NONE, v8qi, v16qi)
@@ -734,6 +736,8 @@
/* Implemented by aarch64_qtbx1<mode>. */
VAR2 (TERNOP, qtbx1, 0, NONE, v8qi, v16qi)
VAR2 (TERNOPU, qtbx1, 0, NONE, v8qi, v16qi)
+ VAR2 (TERNOP_PPPU, qtbx1, 0, NONE, v8qi, v16qi)
+ VAR2 (TERNOP_SSSU, qtbx1, 0, NONE, v8qi, v16qi)
/* Implemented by aarch64_qtbx2<mode>. */
VAR2 (TERNOP, qtbx2, 0, NONE, v8qi, v16qi)
diff --git a/gcc/config/aarch64/aarch64-tune.md b/gcc/config/aarch64/aarch64-tune.md
index e491c29..12be913 100644
--- a/gcc/config/aarch64/aarch64-tune.md
+++ b/gcc/config/aarch64/aarch64-tune.md
@@ -1,5 +1,5 @@
;; -*- buffer-read-only: t -*-
;; Generated automatically by gentune.sh from aarch64-cores.def
(define_attr "tune"
- "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,zeus,neoversev1,neoverse512tvb,saphira,neoversen2,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82"
+ "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,zeus,neoversev1,neoverse512tvb,saphira,neoversen2,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa710,cortexx2"
(const (symbol_ref "((enum attr_tune) aarch64_tune)")))
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 36519cc..a9a1800 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -23390,7 +23390,8 @@ aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
}
/* Expand cpymem, as if from a __builtin_memcpy. Return true if
- we succeed, otherwise return false. */
+ we succeed, otherwise return false, indicating that a libcall to
+ memcpy should be emitted. */
bool
aarch64_expand_cpymem (rtx *operands)
@@ -23407,11 +23408,13 @@ aarch64_expand_cpymem (rtx *operands)
unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
- /* Inline up to 256 bytes when optimizing for speed. */
+ /* Try to inline up to 256 bytes. */
unsigned HOST_WIDE_INT max_copy_size = 256;
- if (optimize_function_for_size_p (cfun))
- max_copy_size = 128;
+ bool size_p = optimize_function_for_size_p (cfun);
+
+ if (size > max_copy_size)
+ return false;
int copy_bits = 256;
@@ -23421,13 +23424,14 @@ aarch64_expand_cpymem (rtx *operands)
|| !TARGET_SIMD
|| (aarch64_tune_params.extra_tuning_flags
& AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
- {
- copy_bits = 128;
- max_copy_size = max_copy_size / 2;
- }
+ copy_bits = 128;
- if (size > max_copy_size)
- return false;
+ /* Emit an inline load+store sequence and count the number of operations
+ involved. We use a simple count of just the loads and stores emitted
+ rather than rtx_insn count as all the pointer adjustments and reg copying
+ in this function will get optimized away later in the pipeline. */
+ start_sequence ();
+ unsigned nops = 0;
base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
dst = adjust_automodify_address (dst, VOIDmode, base, 0);
@@ -23456,7 +23460,8 @@ aarch64_expand_cpymem (rtx *operands)
cur_mode = V4SImode;
aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
-
+ /* A single block copy is 1 load + 1 store. */
+ nops += 2;
n -= mode_bits;
/* Emit trailing copies using overlapping unaligned accesses - this is
@@ -23471,7 +23476,16 @@ aarch64_expand_cpymem (rtx *operands)
n = n_bits;
}
}
+ rtx_insn *seq = get_insns ();
+ end_sequence ();
+
+ /* A memcpy libcall in the worst case takes 3 instructions to prepare the
+ arguments + 1 for the call. */
+ unsigned libcall_cost = 4;
+ if (size_p && libcall_cost < nops)
+ return false;
+ emit_insn (seq);
return true;
}
@@ -23534,40 +23548,37 @@ aarch64_expand_setmem (rtx *operands)
if (!CONST_INT_P (operands[1]))
return false;
- bool speed_p = !optimize_function_for_size_p (cfun);
+ bool size_p = optimize_function_for_size_p (cfun);
/* Default the maximum to 256-bytes. */
unsigned max_set_size = 256;
- /* In case we are optimizing for size or if the core does not
- want to use STP Q regs, lower the max_set_size. */
- max_set_size = (!speed_p
- || (aarch64_tune_params.extra_tuning_flags
- & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
- ? max_set_size / 2 : max_set_size;
-
len = INTVAL (operands[1]);
/* Upper bound check. */
if (len > max_set_size)
return false;
+ /* Attempt a sequence with a vector broadcast followed by stores.
+ Count the number of operations involved to see if it's worth it for
+ code size. */
+ start_sequence ();
+ unsigned nops = 0;
base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
dst = adjust_automodify_address (dst, VOIDmode, base, 0);
/* Prepare the val using a DUP/MOVI v0.16B, val. */
src = expand_vector_broadcast (V16QImode, val);
src = force_reg (V16QImode, src);
-
+ nops++;
/* Convert len to bits to make the rest of the code simpler. */
n = len * BITS_PER_UNIT;
/* Maximum amount to copy in one go. We allow 256-bit chunks based on the
AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter. setmem expand
pattern is only turned on for TARGET_SIMD. */
- const int copy_limit = (speed_p
- && (aarch64_tune_params.extra_tuning_flags
- & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
+ const int copy_limit = (aarch64_tune_params.extra_tuning_flags
+ & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
? GET_MODE_BITSIZE (TImode) : 256;
while (n > 0)
@@ -23583,7 +23594,7 @@ aarch64_expand_setmem (rtx *operands)
mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
-
+ nops++;
n -= mode_bits;
/* Do certain trailing copies as overlapping if it's going to be
@@ -23599,7 +23610,15 @@ aarch64_expand_setmem (rtx *operands)
n = n_bits;
}
}
+ rtx_insn *seq = get_insns ();
+ end_sequence ();
+ /* A call to memset in the worst case requires 3 instructions to prepare
+ the arguments + 1 for the call. Prefer the inline sequence for size if
+ it is no longer than that. */
+ if (size_p && nops > 4)
+ return false;
+ emit_insn (seq);
return true;
}
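Both cpymem and setmem now share the same costing pattern: emit the candidate inline expansion into a detached sequence, count the real loads and stores, and only commit the sequence when, under -Os, it is no longer than the roughly 4-instruction libcall. A condensed sketch of that control flow (GCC internals, simplified; emit_inline_blocks is a hypothetical stand-in for the mode-selection loops above):

    static bool
    expand_inline_or_libcall (rtx *operands)
    {
      unsigned libcall_cost = 4;    /* 3 argument moves + 1 call.  */

      start_sequence ();            /* Divert emission into a buffer.  */
      unsigned nops = emit_inline_blocks (operands);  /* hypothetical  */
      rtx_insn *seq = get_insns ();
      end_sequence ();              /* Stop diverting; seq holds the insns.  */

      if (optimize_function_for_size_p (cfun) && nops > libcall_cost)
        return false;               /* Caller falls back to the libcall.  */

      emit_insn (seq);              /* Commit the inline expansion.  */
      return true;
    }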
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index a5ba6c2..2792bb2 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -231,6 +231,15 @@ extern unsigned aarch64_architecture_version;
/* Pointer Authentication (PAUTH) extension. */
#define AARCH64_FL_PAUTH (1ULL << 40)
+/* 64-byte atomic load/store extensions. */
+#define AARCH64_FL_LS64 (1ULL << 41)
+
+/* Armv8.7-a architecture extensions. */
+#define AARCH64_FL_V8_7 (1ULL << 42)
+
+/* Armv9.0-A. */
+#define AARCH64_FL_V9 (1ULL << 43) /* Armv9.0-A Architecture. */
+
/* Has FP and SIMD. */
#define AARCH64_FL_FPSIMD (AARCH64_FL_FP | AARCH64_FL_SIMD)
@@ -255,8 +264,13 @@ extern unsigned aarch64_architecture_version;
#define AARCH64_FL_FOR_ARCH8_6 \
(AARCH64_FL_FOR_ARCH8_5 | AARCH64_FL_V8_6 | AARCH64_FL_FPSIMD \
| AARCH64_FL_I8MM | AARCH64_FL_BF16)
+#define AARCH64_FL_FOR_ARCH8_7 \
+ (AARCH64_FL_FOR_ARCH8_6 | AARCH64_FL_V8_7 | AARCH64_FL_LS64)
+
#define AARCH64_FL_FOR_ARCH8_R \
(AARCH64_FL_FOR_ARCH8_4 | AARCH64_FL_V8_R)
+#define AARCH64_FL_FOR_ARCH9 \
+ (AARCH64_FL_FOR_ARCH8_5 | AARCH64_FL_SVE | AARCH64_FL_SVE2 | AARCH64_FL_V9)
/* Macros to test ISA flags. */
@@ -295,6 +309,7 @@ extern unsigned aarch64_architecture_version;
#define AARCH64_ISA_SB (aarch64_isa_flags & AARCH64_FL_SB)
#define AARCH64_ISA_V8_R (aarch64_isa_flags & AARCH64_FL_V8_R)
#define AARCH64_ISA_PAUTH (aarch64_isa_flags & AARCH64_FL_PAUTH)
+#define AARCH64_ISA_V9 (aarch64_isa_flags & AARCH64_FL_V9)
/* Crypto is an optional extension to AdvSIMD. */
#define TARGET_CRYPTO (TARGET_SIMD && AARCH64_ISA_CRYPTO)
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 635a223..2d5bf34 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -10416,15 +10416,14 @@ __extension__ extern __inline poly8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqtbl1_p8 (poly8x16_t __tab, uint8x8_t __idx)
{
- return (poly8x8_t) __builtin_aarch64_qtbl1v8qi ((int8x16_t) __tab,
- (int8x8_t) __idx);
+ return __builtin_aarch64_qtbl1v8qi_ppu (__tab, __idx);
}
__extension__ extern __inline int8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqtbl1_s8 (int8x16_t __tab, uint8x8_t __idx)
{
- return __builtin_aarch64_qtbl1v8qi (__tab, (int8x8_t) __idx);
+ return __builtin_aarch64_qtbl1v8qi_ssu (__tab, __idx);
}
__extension__ extern __inline uint8x8_t
@@ -10438,15 +10437,14 @@ __extension__ extern __inline poly8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqtbl1q_p8 (poly8x16_t __tab, uint8x16_t __idx)
{
- return (poly8x16_t) __builtin_aarch64_qtbl1v16qi ((int8x16_t) __tab,
- (int8x16_t) __idx);
+ return __builtin_aarch64_qtbl1v16qi_ppu (__tab, __idx);
}
__extension__ extern __inline int8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqtbl1q_s8 (int8x16_t __tab, uint8x16_t __idx)
{
- return __builtin_aarch64_qtbl1v16qi (__tab, (int8x16_t) __idx);
+ return __builtin_aarch64_qtbl1v16qi_ssu (__tab, __idx);
}
__extension__ extern __inline uint8x16_t
@@ -10460,7 +10458,7 @@ __extension__ extern __inline int8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqtbx1_s8 (int8x8_t __r, int8x16_t __tab, uint8x8_t __idx)
{
- return __builtin_aarch64_qtbx1v8qi (__r, __tab, (int8x8_t) __idx);
+ return __builtin_aarch64_qtbx1v8qi_sssu (__r, __tab, __idx);
}
__extension__ extern __inline uint8x8_t
@@ -10474,16 +10472,14 @@ __extension__ extern __inline poly8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqtbx1_p8 (poly8x8_t __r, poly8x16_t __tab, uint8x8_t __idx)
{
- return (poly8x8_t) __builtin_aarch64_qtbx1v8qi ((int8x8_t) __r,
- (int8x16_t) __tab,
- (int8x8_t) __idx);
+ return __builtin_aarch64_qtbx1v8qi_pppu (__r, __tab, __idx);
}
__extension__ extern __inline int8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqtbx1q_s8 (int8x16_t __r, int8x16_t __tab, uint8x16_t __idx)
{
- return __builtin_aarch64_qtbx1v16qi (__r, __tab, (int8x16_t) __idx);
+ return __builtin_aarch64_qtbx1v16qi_sssu (__r, __tab, __idx);
}
__extension__ extern __inline uint8x16_t
@@ -10497,9 +10493,7 @@ __extension__ extern __inline poly8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqtbx1q_p8 (poly8x16_t __r, poly8x16_t __tab, uint8x16_t __idx)
{
- return (poly8x16_t) __builtin_aarch64_qtbx1v16qi ((int8x16_t) __r,
- (int8x16_t) __tab,
- (int8x16_t) __idx);
+ return __builtin_aarch64_qtbx1v16qi_pppu (__r, __tab, __idx);
}
/* V7 legacy table intrinsics. */
@@ -10528,8 +10522,7 @@ vtbl1_p8 (poly8x8_t __tab, uint8x8_t __idx)
{
poly8x16_t __temp = vcombine_p8 (__tab,
vcreate_p8 (__AARCH64_UINT64_C (0x0)));
- return (poly8x8_t) __builtin_aarch64_qtbl1v8qi ((int8x16_t) __temp,
- (int8x8_t) __idx);
+ return __builtin_aarch64_qtbl1v8qi_ppu (__temp, __idx);
}
__extension__ extern __inline int8x8_t
@@ -10553,8 +10546,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vtbl2_p8 (poly8x8x2_t __tab, uint8x8_t __idx)
{
poly8x16_t __temp = vcombine_p8 (__tab.val[0], __tab.val[1]);
- return (poly8x8_t) __builtin_aarch64_qtbl1v8qi ((int8x16_t) __temp,
- (int8x8_t) __idx);
+ return __builtin_aarch64_qtbl1v8qi_ppu (__temp, __idx);
}
__extension__ extern __inline int8x8_t
@@ -10653,9 +10645,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vtbx2_p8 (poly8x8_t __r, poly8x8x2_t __tab, uint8x8_t __idx)
{
poly8x16_t __temp = vcombine_p8 (__tab.val[0], __tab.val[1]);
- return (poly8x8_t) __builtin_aarch64_qtbx1v8qi ((int8x8_t) __r,
- (int8x16_t) __temp,
- (int8x8_t) __idx);
+ return __builtin_aarch64_qtbx1v8qi_pppu (__r, __temp, __idx);
}
/* End of temporary inline asm. */
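The payoff of the new _ppu/_ssu/_pppu/_sssu builtin variants is that the intrinsics now type-check without round-tripping through int8 vectors. A usage sketch (hedged; any AArch64 compiler providing arm_neon.h):

    #include <arm_neon.h>

    /* Byte-wise table lookup with the natural poly/unsigned types;
       indices >= 16 produce 0 in the result lane.  */
    poly8x8_t
    reverse_low_half (poly8x16_t table)
    {
      const uint8x8_t idx = { 7, 6, 5, 4, 3, 2, 1, 0 };
      return vqtbl1_p8 (table, idx);    /* no casts needed any more */
    }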
diff --git a/gcc/config/arm/arm-cpus.in b/gcc/config/arm/arm-cpus.in
index bcc9ebe..d0d0d0f 100644
--- a/gcc/config/arm/arm-cpus.in
+++ b/gcc/config/arm/arm-cpus.in
@@ -1612,6 +1612,16 @@ begin cpu cortex-r52
part d13
end cpu cortex-r52
+begin cpu cortex-r52plus
+ cname cortexr52plus
+ tune flags LDSCHED
+ architecture armv8-r+crc+simd
+ option nofp.dp remove FP_DBL ALL_SIMD
+ costs cortex
+ vendor 41
+ part d16
+end cpu cortex-r52plus
+
# FPU entries
# format:
# begin fpu <name>
diff --git a/gcc/config/arm/arm-tables.opt b/gcc/config/arm/arm-tables.opt
index 5692d4f..8bb0c9f 100644
--- a/gcc/config/arm/arm-tables.opt
+++ b/gcc/config/arm/arm-tables.opt
@@ -282,6 +282,9 @@ Enum(processor_type) String(cortex-m55) Value( TARGET_CPU_cortexm55)
EnumValue
Enum(processor_type) String(cortex-r52) Value( TARGET_CPU_cortexr52)
+EnumValue
+Enum(processor_type) String(cortex-r52plus) Value( TARGET_CPU_cortexr52plus)
+
Enum
Name(arm_arch) Type(int)
Known ARM architectures (for use with the -march= option):
diff --git a/gcc/config/arm/arm-tune.md b/gcc/config/arm/arm-tune.md
index b9df864..6482833 100644
--- a/gcc/config/arm/arm-tune.md
+++ b/gcc/config/arm/arm-tune.md
@@ -49,5 +49,5 @@
cortexx1,neoversen1,cortexa75cortexa55,
cortexa76cortexa55,neoversev1,neoversen2,
cortexm23,cortexm33,cortexm35p,
- cortexm55,cortexr52"
+ cortexm55,cortexr52,cortexr52plus"
(const (symbol_ref "((enum attr_tune) arm_tune)")))
diff --git a/gcc/config/darwin.h b/gcc/config/darwin.h
index 50524a5..0fa1c57 100644
--- a/gcc/config/darwin.h
+++ b/gcc/config/darwin.h
@@ -251,7 +251,7 @@ extern GTY(()) int darwin_ms_struct;
%{v} \
%{g*:%{!gctf:%{!gbtf:%{!gstabs*:%{%:debug-level-gt(0): -idsym}}}}}\
%{.c|.cc|.C|.cpp|.cp|.c++|.cxx|.CPP|.m|.mm|.s|.f|.f90|\
- .f95|.f03|.f77|.for|.F|.F90|.F95|.F03: \
+ .f95|.f03|.f77|.for|.F|.F90|.F95|.F03|.d: \
%{g*:%{!gctf:%{!gbtf:%{!gstabs*:%{%:debug-level-gt(0): -dsym}}}}}}}}}}}}}"
#define LINK_COMMAND_SPEC LINK_COMMAND_SPEC_A DSYMUTIL_SPEC
diff --git a/gcc/config/gcn/gcn-hsa.h b/gcc/config/gcn/gcn-hsa.h
index fc99c8d..6a432d1 100644
--- a/gcc/config/gcn/gcn-hsa.h
+++ b/gcc/config/gcn/gcn-hsa.h
@@ -75,25 +75,66 @@ extern unsigned int gcn_local_sym_hash (const char *name);
supported for gcn. */
#define GOMP_SELF_SPECS ""
+#ifdef HAVE_GCN_XNACK_FIJI
+#define X_FIJI
+#else
+#define X_FIJI "!march=*:;march=fiji:;"
+#endif
+#ifdef HAVE_GCN_XNACK_GFX900
+#define X_900
+#else
+#define X_900 "march=gfx900:;"
+#endif
+#ifdef HAVE_GCN_XNACK_GFX906
+#define X_906
+#else
+#define X_906 "march=gfx906:;"
+#endif
+#ifdef HAVE_GCN_XNACK_GFX908
+#define X_908
+#else
+#define X_908 "march=gfx908:;"
+#endif
+
#ifdef HAVE_GCN_SRAM_ECC_FIJI
-#define A_FIJI
+#define S_FIJI
#else
-#define A_FIJI "!march=*:;march=fiji:;"
+#define S_FIJI "!march=*:;march=fiji:;"
#endif
#ifdef HAVE_GCN_SRAM_ECC_GFX900
-#define A_900
+#define S_900
#else
-#define A_900 "march=gfx900:;"
+#define S_900 "march=gfx900:;"
#endif
#ifdef HAVE_GCN_SRAM_ECC_GFX906
-#define A_906
+#define S_906
#else
-#define A_906 "march=gfx906:;"
+#define S_906 "march=gfx906:;"
#endif
#ifdef HAVE_GCN_SRAM_ECC_GFX908
-#define A_908
+#define S_908
+#else
+#define S_908 "march=gfx908:;"
+#endif
+
+#ifdef HAVE_GCN_ASM_V3_SYNTAX
+#define SRAMOPT "!msram-ecc=off:-mattr=+sram-ecc;:-mattr=-sram-ecc"
+#endif
+#ifdef HAVE_GCN_ASM_V4_SYNTAX
+/* In HSACOv4 no attribute setting means the binary supports "any" hardware
+ configuration. The name of the attribute also changed. */
+#define SRAMOPT "msram-ecc=on:-mattr=+sramecc;msram-ecc=off:-mattr=-sramecc"
+#endif
+#if !defined(SRAMOPT) && !defined(IN_LIBGCC2)
+#error "No assembler syntax configured"
+#endif
+
+#ifdef HAVE_GCN_ASM_V4_SYNTAX
+/* FIJI cards don't seem to support drivers new enough to allow HSACOv4. */
+#define HSACO3_SELECT_OPT \
+ "%{!march=*|march=fiji:--amdhsa-code-object-version=3} "
#else
-#define A_908 "march=gfx908:;"
+#define HSACO3_SELECT_OPT
#endif
/* These targets can't have SRAM-ECC, even if a broken assembler allows it. */
@@ -103,10 +144,10 @@ extern unsigned int gcn_local_sym_hash (const char *name);
/* Use LLVM assembler and linker options. */
#define ASM_SPEC "-triple=amdgcn--amdhsa " \
"%:last_arg(%{march=*:-mcpu=%*}) " \
- "-mattr=%{mxnack:+xnack;:-xnack} " \
- /* FIXME: support "any" when we move to HSACOv4. */ \
- "-mattr=%{" A_FIJI A_900 A_906 A_908 \
- "!msram-ecc=off:+sram-ecc;:-sram-ecc} " \
+ HSACO3_SELECT_OPT \
+ "%{" X_FIJI X_900 X_906 X_908 \
+ "mxnack:-mattr=+xnack;:-mattr=-xnack} " \
+ "%{" S_FIJI S_900 S_906 S_908 SRAMOPT "} " \
"-filetype=obj"
#define LINK_SPEC "--pie --export-dynamic"
#define LIB_SPEC "-lc"
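A worked reading of the new spec, tracing the %{...} selector syntax by hand (a hedged example, not quoted from GCC documentation): suppose the configured assembler supports the XNACK attribute for gfx900 but not for Fiji, so HAVE_GCN_XNACK_GFX900 is defined and HAVE_GCN_XNACK_FIJI is not. Then X_900 expands to nothing while X_FIJI contributes "!march=*:;march=fiji:;", and the group "%{" X_FIJI X_900 X_906 X_908 "mxnack:-mattr=+xnack;:-mattr=-xnack}" passes no XNACK attribute at all for -march=fiji (or a missing -march), -mattr=+xnack when -mxnack is given, and -mattr=-xnack otherwise.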
diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md
index 84ff675..01fdce6 100644
--- a/gcc/config/gcn/gcn-valu.md
+++ b/gcc/config/gcn/gcn-valu.md
@@ -827,8 +827,12 @@
/* Work around assembler bug in which a 64-bit register is expected,
but a 32-bit value would be correct. */
int reg = REGNO (operands[2]) - FIRST_VGPR_REG;
- sprintf (buf, "global_load%%o0\t%%0, v[%d:%d], %%1 offset:%%3%s\;"
- "s_waitcnt\tvmcnt(0)", reg, reg + 1, glc);
+ if (HAVE_GCN_ASM_GLOBAL_LOAD_FIXED)
+ sprintf (buf, "global_load%%o0\t%%0, v%d, %%1 offset:%%3%s\;"
+ "s_waitcnt\tvmcnt(0)", reg, glc);
+ else
+ sprintf (buf, "global_load%%o0\t%%0, v[%d:%d], %%1 offset:%%3%s\;"
+ "s_waitcnt\tvmcnt(0)", reg, reg + 1, glc);
}
else
gcc_unreachable ();
@@ -958,8 +962,12 @@
/* Work around assembler bug in which a 64-bit register is expected,
but a 32-bit value would be correct. */
int reg = REGNO (operands[1]) - FIRST_VGPR_REG;
- sprintf (buf, "global_store%%s3\tv[%d:%d], %%3, %%0 offset:%%2%s",
- reg, reg + 1, glc);
+ if (HAVE_GCN_ASM_GLOBAL_LOAD_FIXED)
+ sprintf (buf, "global_store%%s3\tv%d, %%3, %%0 offset:%%2%s",
+ reg, glc);
+ else
+ sprintf (buf, "global_store%%s3\tv[%d:%d], %%3, %%0 offset:%%2%s",
+ reg, reg + 1, glc);
}
else
gcc_unreachable ();
diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c
index 2a3fc96..2e90f32 100644
--- a/gcc/config/gcn/gcn.c
+++ b/gcc/config/gcn/gcn.c
@@ -5217,42 +5217,76 @@ static void
output_file_start (void)
{
const char *cpu;
- bool use_sram = flag_sram_ecc;
+ bool use_xnack_attr = true;
+ bool use_sram_attr = true;
switch (gcn_arch)
{
case PROCESSOR_FIJI:
cpu = "gfx803";
+#ifndef HAVE_GCN_XNACK_FIJI
+ use_xnack_attr = false;
+#endif
#ifndef HAVE_GCN_SRAM_ECC_FIJI
- use_sram = false;
+ use_sram_attr = false;
#endif
break;
case PROCESSOR_VEGA10:
cpu = "gfx900";
+#ifndef HAVE_GCN_XNACK_GFX900
+ use_xnack_attr = false;
+#endif
#ifndef HAVE_GCN_SRAM_ECC_GFX900
- use_sram = false;
+ use_sram_attr = false;
#endif
break;
case PROCESSOR_VEGA20:
cpu = "gfx906";
+#ifndef HAVE_GCN_XNACK_GFX906
+ use_xnack_attr = false;
+#endif
#ifndef HAVE_GCN_SRAM_ECC_GFX906
- use_sram = false;
+ use_sram_attr = false;
#endif
break;
case PROCESSOR_GFX908:
cpu = "gfx908";
+#ifndef HAVE_GCN_XNACK_GFX908
+ use_xnack_attr = false;
+#endif
#ifndef HAVE_GCN_SRAM_ECC_GFX908
- use_sram = false;
+ use_sram_attr = false;
#endif
break;
default: gcc_unreachable ();
}
+#if HAVE_GCN_ASM_V3_SYNTAX
const char *xnack = (flag_xnack ? "+xnack" : "");
- /* FIXME: support "any" when we move to HSACOv4. */
- const char *sram_ecc = (use_sram ? "+sram-ecc" : "");
+ const char *sram_ecc = (flag_sram_ecc ? "+sram-ecc" : "");
+#endif
+#if HAVE_GCN_ASM_V4_SYNTAX
+ /* In HSACOv4 no attribute setting means the binary supports "any" hardware
+ configuration. In GCC binaries, this is true for SRAM ECC, but not
+ XNACK. */
+ const char *xnack = (flag_xnack ? ":xnack+" : ":xnack-");
+ const char *sram_ecc = (flag_sram_ecc == SRAM_ECC_ON ? ":sramecc+"
+ : flag_sram_ecc == SRAM_ECC_OFF ? ":sramecc-"
+ : "");
+#endif
+ if (!use_xnack_attr)
+ xnack = "";
+ if (!use_sram_attr)
+ sram_ecc = "";
fprintf(asm_out_file, "\t.amdgcn_target \"amdgcn-unknown-amdhsa--%s%s%s\"\n",
- cpu, xnack, sram_ecc);
+ cpu,
+#if HAVE_GCN_ASM_V3_SYNTAX
+ xnack, sram_ecc
+#endif
+#ifdef HAVE_GCN_ASM_V4_SYNTAX
+ sram_ecc, xnack
+#endif
+ );
}
/* Implement ASM_DECLARE_FUNCTION_NAME via gcn-hsa.h.
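For example (hedged, assuming an HSACOv4 assembler that supports both attributes for the target), compiling with -march=gfx906 -msram-ecc=on -mno-xnack would emit:

    .amdgcn_target "amdgcn-unknown-amdhsa--gfx906:sramecc+:xnack-"

whereas leaving -msram-ecc at its default omits the sramecc attribute entirely, which under HSACOv4 means the binary runs on "any" configuration.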
diff --git a/gcc/config/gcn/mkoffload.c b/gcc/config/gcn/mkoffload.c
index 732bdfd..a3b22d0 100644
--- a/gcc/config/gcn/mkoffload.c
+++ b/gcc/config/gcn/mkoffload.c
@@ -54,8 +54,51 @@
#undef EF_AMDGPU_MACH_AMDGCN_GFX908
#define EF_AMDGPU_MACH_AMDGCN_GFX908 0x30
-#define EF_AMDGPU_XNACK 0x100
-#define EF_AMDGPU_SRAM_ECC 0x200
+#define EF_AMDGPU_XNACK_V3 0x100
+#define EF_AMDGPU_SRAM_ECC_V3 0x200
+
+#define EF_AMDGPU_FEATURE_XNACK_V4 0x300 /* Mask. */
+#define EF_AMDGPU_FEATURE_XNACK_UNSUPPORTED_V4 0x000
+#define EF_AMDGPU_FEATURE_XNACK_ANY_V4 0x100
+#define EF_AMDGPU_FEATURE_XNACK_OFF_V4 0x200
+#define EF_AMDGPU_FEATURE_XNACK_ON_V4 0x300
+
+#define EF_AMDGPU_FEATURE_SRAMECC_V4 0xc00 /* Mask. */
+#define EF_AMDGPU_FEATURE_SRAMECC_UNSUPPORTED_V4 0x000
+#define EF_AMDGPU_FEATURE_SRAMECC_ANY_V4 0x400
+#define EF_AMDGPU_FEATURE_SRAMECC_OFF_V4 0x800
+#define EF_AMDGPU_FEATURE_SRAMECC_ON_V4 0xc00
+
+#ifdef HAVE_GCN_ASM_V3_SYNTAX
+#define SET_XNACK_ON(VAR) VAR |= EF_AMDGPU_XNACK_V3
+#define SET_XNACK_OFF(VAR) VAR &= ~EF_AMDGPU_XNACK_V3
+#define TEST_XNACK(VAR) (VAR & EF_AMDGPU_XNACK_V3)
+
+#define SET_SRAM_ECC_ON(VAR) VAR |= EF_AMDGPU_SRAM_ECC_V3
+#define SET_SRAM_ECC_ANY(VAR) SET_SRAM_ECC_ON (VAR)
+#define SET_SRAM_ECC_OFF(VAR) VAR &= ~EF_AMDGPU_SRAM_ECC_V3
+#define TEST_SRAM_ECC_ANY(VAR) 0 /* Not supported. */
+#define TEST_SRAM_ECC_ON(VAR) (VAR & EF_AMDGPU_SRAM_ECC_V3)
+#endif
+#ifdef HAVE_GCN_ASM_V4_SYNTAX
+#define SET_XNACK_ON(VAR) VAR = ((VAR & ~EF_AMDGPU_FEATURE_XNACK_V4) \
+ | EF_AMDGPU_FEATURE_XNACK_ON_V4)
+#define SET_XNACK_OFF(VAR) VAR = ((VAR & ~EF_AMDGPU_FEATURE_XNACK_V4) \
+ | EF_AMDGPU_FEATURE_XNACK_OFF_V4)
+#define TEST_XNACK(VAR) ((VAR & EF_AMDGPU_FEATURE_XNACK_V4) \
+ == EF_AMDGPU_FEATURE_XNACK_ON_V4)
+
+#define SET_SRAM_ECC_ON(VAR) VAR = ((VAR & ~EF_AMDGPU_FEATURE_SRAMECC_V4) \
+ | EF_AMDGPU_FEATURE_SRAMECC_ON_V4)
+#define SET_SRAM_ECC_ANY(VAR) VAR = ((VAR & ~EF_AMDGPU_FEATURE_SRAMECC_V4) \
+ | EF_AMDGPU_FEATURE_SRAMECC_ANY_V4)
+#define SET_SRAM_ECC_OFF(VAR) VAR = ((VAR & ~EF_AMDGPU_FEATURE_SRAMECC_V4) \
+ | EF_AMDGPU_FEATURE_SRAMECC_OFF_V4)
+#define TEST_SRAM_ECC_ANY(VAR) ((VAR & EF_AMDGPU_FEATURE_SRAMECC_V4) \
+ == EF_AMDGPU_FEATURE_SRAMECC_ANY_V4)
+#define TEST_SRAM_ECC_ON(VAR) ((VAR & EF_AMDGPU_FEATURE_SRAMECC_V4) \
+ == EF_AMDGPU_FEATURE_SRAMECC_ON_V4)
+#endif
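Unlike the V3 single-bit flags, the V4 variants are two-bit fields, so every setter must clear the whole field before installing the new value and every test must compare the full field. A self-contained sketch of the same encoding (constants copied from the macros above):

    #include <assert.h>
    #include <stdint.h>

    #define XNACK_MASK_V4  0x300u    /* two-bit field */
    #define XNACK_ANY_V4   0x100u
    #define XNACK_ON_V4    0x300u

    int main (void)
    {
      uint32_t flags = XNACK_ANY_V4;    /* HSACOv4 default: "any" */
      /* SET_XNACK_ON: clear the field, then install the new value.  */
      flags = (flags & ~XNACK_MASK_V4) | XNACK_ON_V4;
      /* TEST_XNACK: compare the whole field, not one bit.  */
      assert ((flags & XNACK_MASK_V4) == XNACK_ON_V4);
      return 0;
    }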
#ifndef R_AMDGPU_NONE
#define R_AMDGPU_NONE 0
@@ -80,7 +123,13 @@ static struct obstack files_to_cleanup;
enum offload_abi offload_abi = OFFLOAD_ABI_UNSET;
uint32_t elf_arch = EF_AMDGPU_MACH_AMDGCN_GFX803; // Default GPU architecture.
-uint32_t elf_flags = 0;
+uint32_t elf_flags =
+#ifdef HAVE_GCN_ASM_V3_SYNTAX
+ 0;
+#endif
+#ifdef HAVE_GCN_ASM_V4_SYNTAX
+ (EF_AMDGPU_FEATURE_XNACK_ANY_V4 | EF_AMDGPU_FEATURE_SRAMECC_ANY_V4);
+#endif
/* Delete tempfiles. */
@@ -851,23 +900,22 @@ main (int argc, char **argv)
else if (strcmp (argv[i], "-fpic") == 0)
fpic = true;
else if (strcmp (argv[i], "-mxnack") == 0)
- elf_flags |= EF_AMDGPU_XNACK;
+ SET_XNACK_ON (elf_flags);
else if (strcmp (argv[i], "-mno-xnack") == 0)
- elf_flags &= ~EF_AMDGPU_XNACK;
+ SET_XNACK_OFF (elf_flags);
else if (strcmp (argv[i], "-msram-ecc=on") == 0)
{
- elf_flags |= EF_AMDGPU_SRAM_ECC;
+ SET_SRAM_ECC_ON (elf_flags);
sram_seen = true;
}
else if (strcmp (argv[i], "-msram-ecc=any") == 0)
{
- /* FIXME: change this when we move to HSACOv4. */
- elf_flags |= EF_AMDGPU_SRAM_ECC;
+ SET_SRAM_ECC_ANY (elf_flags);
sram_seen = true;
}
else if (strcmp (argv[i], "-msram-ecc=off") == 0)
{
- elf_flags &= ~EF_AMDGPU_SRAM_ECC;
+ SET_SRAM_ECC_OFF (elf_flags);
sram_seen = true;
}
else if (strcmp (argv[i], "-save-temps") == 0)
@@ -890,23 +938,27 @@ main (int argc, char **argv)
if (!(fopenacc ^ fopenmp))
fatal_error (input_location, "either -fopenacc or -fopenmp must be set");
- /* The SRAM-ECC feature defaults to "any" on GPUs where the feature is
- available. */
if (!sram_seen)
- switch (elf_arch)
- {
- case EF_AMDGPU_MACH_AMDGCN_GFX803:
- case EF_AMDGPU_MACH_AMDGCN_GFX900:
- case EF_AMDGPU_MACH_AMDGCN_GFX906:
+ {
+#ifdef HAVE_GCN_ASM_V3_SYNTAX
+ /* For HSACOv3, the SRAM-ECC feature defaults to "on" on GPUs where the
+ feature is available.
+ (HSACOv4 has elf_flags initialised to "any" in all cases.) */
+ switch (elf_arch)
+ {
+ case EF_AMDGPU_MACH_AMDGCN_GFX803:
+ case EF_AMDGPU_MACH_AMDGCN_GFX900:
+ case EF_AMDGPU_MACH_AMDGCN_GFX906:
#ifndef HAVE_GCN_SRAM_ECC_GFX908
- case EF_AMDGPU_MACH_AMDGCN_GFX908:
+ case EF_AMDGPU_MACH_AMDGCN_GFX908:
#endif
- break;
- default:
- /* FIXME: change this when we move to HSACOv4. */
- elf_flags |= EF_AMDGPU_SRAM_ECC;
- break;
- }
+ break;
+ default:
+ SET_SRAM_ECC_ON (elf_flags);
+ break;
+ }
+#endif
+ }
const char *abi;
switch (offload_abi)
@@ -936,11 +988,12 @@ main (int argc, char **argv)
if (fopenmp)
obstack_ptr_grow (&cc_argv_obstack, "-mgomp");
obstack_ptr_grow (&cc_argv_obstack,
- (elf_flags & EF_AMDGPU_XNACK
+ (TEST_XNACK (elf_flags)
? "-mxnack" : "-mno-xnack"));
obstack_ptr_grow (&cc_argv_obstack,
- (elf_flags & EF_AMDGPU_SRAM_ECC
- ? "-msram-ecc=on" : "-msram-ecc=off"));
+ (TEST_SRAM_ECC_ON (elf_flags) ? "-msram-ecc=on"
+ : TEST_SRAM_ECC_ANY (elf_flags) ? "-msram-ecc=any"
+ : "-msram-ecc=off"));
for (int ix = 1; ix != argc; ix++)
{
@@ -1043,11 +1096,12 @@ main (int argc, char **argv)
obstack_ptr_grow (&ld_argv_obstack, gcn_s2_name);
obstack_ptr_grow (&ld_argv_obstack, "-lgomp");
obstack_ptr_grow (&ld_argv_obstack,
- (elf_flags & EF_AMDGPU_XNACK
+ (TEST_XNACK (elf_flags)
? "-mxnack" : "-mno-xnack"));
obstack_ptr_grow (&ld_argv_obstack,
- (elf_flags & EF_AMDGPU_SRAM_ECC
- ? "-msram-ecc=on" : "-msram-ecc=off"));
+ (TEST_SRAM_ECC_ON (elf_flags) ? "-msram-ecc=on"
+ : TEST_SRAM_ECC_ANY (elf_flags) ? "-msram-ecc=any"
+ : "-msram-ecc=off"));
if (verbose)
obstack_ptr_grow (&ld_argv_obstack, "-v");
diff --git a/gcc/config/i386/avx512fp16intrin.h b/gcc/config/i386/avx512fp16intrin.h
index 4714696..29cf679 100644
--- a/gcc/config/i386/avx512fp16intrin.h
+++ b/gcc/config/i386/avx512fp16intrin.h
@@ -45,6 +45,14 @@ typedef _Float16 __m128h __attribute__ ((__vector_size__ (16), __may_alias__));
typedef _Float16 __m256h __attribute__ ((__vector_size__ (32), __may_alias__));
typedef _Float16 __m512h __attribute__ ((__vector_size__ (64), __may_alias__));
+/* Unaligned versions of the same types. */
+typedef _Float16 __m128h_u __attribute__ ((__vector_size__ (16), \
+ __may_alias__, __aligned__ (1)));
+typedef _Float16 __m256h_u __attribute__ ((__vector_size__ (32), \
+ __may_alias__, __aligned__ (1)));
+typedef _Float16 __m512h_u __attribute__ ((__vector_size__ (64), \
+ __may_alias__, __aligned__ (1)));
+
extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ph (_Float16 __A7, _Float16 __A6, _Float16 __A5,
@@ -362,6 +370,48 @@ _mm_load_sh (void const *__P)
*(_Float16 const *) __P);
}
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_load_ph (void const *__P)
+{
+ return *(const __m512h *) __P;
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_load_ph (void const *__P)
+{
+ return *(const __m256h *) __P;
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_ph (void const *__P)
+{
+ return *(const __m128h *) __P;
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_loadu_ph (void const *__P)
+{
+ return *(const __m512h_u *) __P;
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_loadu_ph (void const *__P)
+{
+ return *(const __m256h_u *) __P;
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadu_ph (void const *__P)
+{
+ return *(const __m128h_u *) __P;
+}
+
/* Stores the lower _Float16 value. */
extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
@@ -370,6 +420,56 @@ _mm_store_sh (void *__P, __m128h __A)
*(_Float16 *) __P = ((__v8hf)__A)[0];
}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_store_ph (void *__P, __m512h __A)
+{
+ *(__m512h *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_store_ph (void *__P, __m256h __A)
+{
+ *(__m256h *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_ph (void *__P, __m128h __A)
+{
+ *(__m128h *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_storeu_ph (void *__P, __m512h __A)
+{
+ *(__m512h_u *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_storeu_ph (void *__P, __m256h __A)
+{
+ *(__m256h_u *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeu_ph (void *__P, __m128h __A)
+{
+ *(__m128h_u *) __P = __A;
+}
+
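A usage sketch of the new unaligned load/store forms (hedged; compile with -mavx512fp16; _mm512_set1_ph and _mm512_mul_ph are assumed from the same header):

    #include <immintrin.h>

    /* Double 32 half-precision values at an arbitrarily aligned address.  */
    void
    scale_halves (_Float16 *p)
    {
      __m512h v = _mm512_loadu_ph (p);    /* no 64-byte alignment required */
      v = _mm512_mul_ph (v, _mm512_set1_ph ((_Float16) 2.0));
      _mm512_storeu_ph (p, v);
    }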
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_abs_ph (__m512h __A)
+{
+ return (__m512h) _mm512_and_epi32 (_mm512_set1_epi32 (0x7FFF7FFF),
+ (__m512i) __A);
+}
+
/* Intrinsics v[add,sub,mul,div]ph. */
extern __inline __m512h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
@@ -621,6 +721,33 @@ _mm512_maskz_div_round_ph (__mmask32 __A, __m512h __B, __m512h __C,
(A), (D)))
#endif /* __OPTIMIZE__ */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_conj_pch (__m512h __A)
+{
+ return (__m512h) _mm512_xor_epi32 ((__m512i) __A, _mm512_set1_epi32 (1<<31));
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_conj_pch (__m512h __W, __mmask16 __U, __m512h __A)
+{
+ return (__m512h)
+ __builtin_ia32_movaps512_mask ((__v16sf) _mm512_conj_pch (__A),
+ (__v16sf) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_conj_pch (__mmask16 __U, __m512h __A)
+{
+ return (__m512h)
+ __builtin_ia32_movaps512_mask ((__v16sf) _mm512_conj_pch (__A),
+ (__v16sf) _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+
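Each complex _Float16 occupies one 32-bit lane, real part in the low half and imaginary part in the high half, so conjugation reduces to XORing bit 31 of every lane; that is all the _mm512_set1_epi32 (1<<31) constant above does. A scalar sketch of the per-lane operation (layout as implied by that constant):

    #include <stdint.h>

    /* Flip the sign of the imaginary _Float16 in one 32-bit lane.  */
    static inline uint32_t
    conj_pch_lane (uint32_t lane)
    {
      return lane ^ UINT32_C (0x80000000);
    }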
/* Intrinsics of v[add,sub,mul,div]sh. */
extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
@@ -6115,6 +6242,1006 @@ _mm_maskz_fnmsub_round_sh (__mmask8 __U, __m128h __W, __m128h __A,
#endif /* __OPTIMIZE__ */
+/* Intrinsics vf[,c]maddcph. */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fcmadd_pch (__m512h __A, __m512h __B, __m512h __C)
+{
+ return (__m512h)
+ __builtin_ia32_vfcmaddcph512_round ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fcmadd_pch (__m512h __A, __mmask16 __B, __m512h __C, __m512h __D)
+{
+ return (__m512h) __builtin_ia32_movaps512_mask
+ ((__v16sf)
+ __builtin_ia32_vfcmaddcph512_mask_round ((__v32hf) __A,
+ (__v32hf) __C,
+ (__v32hf) __D, __B,
+ _MM_FROUND_CUR_DIRECTION),
+ (__v16sf) __A, __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fcmadd_pch (__m512h __A, __m512h __B, __m512h __C, __mmask16 __D)
+{
+ return (__m512h)
+ __builtin_ia32_vfcmaddcph512_mask_round ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ __D, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fcmadd_pch (__mmask16 __A, __m512h __B, __m512h __C, __m512h __D)
+{
+ return (__m512h)
+ __builtin_ia32_vfcmaddcph512_maskz_round ((__v32hf) __B,
+ (__v32hf) __C,
+ (__v32hf) __D,
+ __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmadd_pch (__m512h __A, __m512h __B, __m512h __C)
+{
+ return (__m512h)
+ __builtin_ia32_vfmaddcph512_round ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmadd_pch (__m512h __A, __mmask16 __B, __m512h __C, __m512h __D)
+{
+ return (__m512h) __builtin_ia32_movaps512_mask
+ ((__v16sf)
+ __builtin_ia32_vfmaddcph512_mask_round ((__v32hf) __A,
+ (__v32hf) __C,
+ (__v32hf) __D, __B,
+ _MM_FROUND_CUR_DIRECTION),
+ (__v16sf) __A, __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmadd_pch (__m512h __A, __m512h __B, __m512h __C, __mmask16 __D)
+{
+ return (__m512h)
+ __builtin_ia32_vfmaddcph512_mask_round ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ __D, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmadd_pch (__mmask16 __A, __m512h __B, __m512h __C, __m512h __D)
+{
+ return (__m512h)
+ __builtin_ia32_vfmaddcph512_maskz_round ((__v32hf) __B,
+ (__v32hf) __C,
+ (__v32hf) __D,
+ __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fcmadd_round_pch (__m512h __A, __m512h __B, __m512h __C, const int __D)
+{
+ return (__m512h)
+ __builtin_ia32_vfcmaddcph512_round ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ __D);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fcmadd_round_pch (__m512h __A, __mmask16 __B, __m512h __C,
+ __m512h __D, const int __E)
+{
+ return (__m512h) __builtin_ia32_movaps512_mask
+ ((__v16sf)
+ __builtin_ia32_vfcmaddcph512_mask_round ((__v32hf) __A,
+ (__v32hf) __C,
+ (__v32hf) __D, __B,
+ __E),
+ (__v16sf) __A, __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fcmadd_round_pch (__m512h __A, __m512h __B, __m512h __C,
+ __mmask16 __D, const int __E)
+{
+ return (__m512h)
+ __builtin_ia32_vfcmaddcph512_mask_round ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ __D, __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fcmadd_round_pch (__mmask16 __A, __m512h __B, __m512h __C,
+ __m512h __D, const int __E)
+{
+ return (__m512h)
+ __builtin_ia32_vfcmaddcph512_maskz_round ((__v32hf) __B,
+ (__v32hf) __C,
+ (__v32hf) __D,
+ __A, __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmadd_round_pch (__m512h __A, __m512h __B, __m512h __C, const int __D)
+{
+ return (__m512h)
+ __builtin_ia32_vfmaddcph512_round ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ __D);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmadd_round_pch (__m512h __A, __mmask16 __B, __m512h __C,
+ __m512h __D, const int __E)
+{
+ return (__m512h) __builtin_ia32_movaps512_mask
+ ((__v16sf)
+ __builtin_ia32_vfmaddcph512_mask_round ((__v32hf) __A,
+ (__v32hf) __C,
+ (__v32hf) __D, __B,
+ __E),
+ (__v16sf) __A, __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmadd_round_pch (__m512h __A, __m512h __B, __m512h __C,
+ __mmask16 __D, const int __E)
+{
+ return (__m512h)
+ __builtin_ia32_vfmaddcph512_mask_round ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ __D, __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmadd_round_pch (__mmask16 __A, __m512h __B, __m512h __C,
+ __m512h __D, const int __E)
+{
+ return (__m512h)
+ __builtin_ia32_vfmaddcph512_maskz_round ((__v32hf) __B,
+ (__v32hf) __C,
+ (__v32hf) __D,
+ __A, __E);
+}
+
+#else
+#define _mm512_fcmadd_round_pch(A, B, C, D) \
+ (__m512h) __builtin_ia32_vfcmaddcph512_round ((A), (B), (C), (D))
+
+#define _mm512_mask_fcmadd_round_pch(A, B, C, D, E) \
+ ((__m512h) __builtin_ia32_movaps512_mask ( \
+ (__v16sf) \
+ __builtin_ia32_vfcmaddcph512_mask_round ((__v32hf) (A), \
+ (__v32hf) (C), \
+ (__v32hf) (D), \
+ (B), (E)), \
+ (__v16sf) (A), (B)))
+
+
+#define _mm512_mask3_fcmadd_round_pch(A, B, C, D, E) \
+ ((__m512h) \
+ __builtin_ia32_vfcmaddcph512_mask_round ((A), (B), (C), (D), (E)))
+
+#define _mm512_maskz_fcmadd_round_pch(A, B, C, D, E) \
+ (__m512h) \
+ __builtin_ia32_vfcmaddcph512_maskz_round ((B), (C), (D), (A), (E))
+
+#define _mm512_fmadd_round_pch(A, B, C, D) \
+ (__m512h) __builtin_ia32_vfmaddcph512_round ((A), (B), (C), (D))
+
+#define _mm512_mask_fmadd_round_pch(A, B, C, D, E) \
+ ((__m512h) __builtin_ia32_movaps512_mask ( \
+ (__v16sf) \
+ __builtin_ia32_vfmaddcph512_mask_round ((__v32hf) (A), \
+ (__v32hf) (C), \
+ (__v32hf) (D), \
+ (B), (E)), \
+ (__v16sf) (A), (B)))
+
+#define _mm512_mask3_fmadd_round_pch(A, B, C, D, E) \
+ (__m512h) \
+ __builtin_ia32_vfmaddcph512_mask_round ((A), (B), (C), (D), (E))
+
+#define _mm512_maskz_fmadd_round_pch(A, B, C, D, E) \
+ (__m512h) \
+ __builtin_ia32_vfmaddcph512_maskz_round ((B), (C), (D), (A), (E))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vf[,c]mulcph. */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fcmul_pch (__m512h __A, __m512h __B)
+{
+ return (__m512h)
+ __builtin_ia32_vfcmulcph512_round ((__v32hf) __A,
+ (__v32hf) __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fcmul_pch (__m512h __A, __mmask16 __B, __m512h __C, __m512h __D)
+{
+ return (__m512h)
+ __builtin_ia32_vfcmulcph512_mask_round ((__v32hf) __C,
+ (__v32hf) __D,
+ (__v32hf) __A,
+ __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fcmul_pch (__mmask16 __A, __m512h __B, __m512h __C)
+{
+ return (__m512h)
+ __builtin_ia32_vfcmulcph512_mask_round ((__v32hf) __B,
+ (__v32hf) __C,
+ _mm512_setzero_ph (),
+ __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmul_pch (__m512h __A, __m512h __B)
+{
+ return (__m512h)
+ __builtin_ia32_vfmulcph512_round ((__v32hf) __A,
+ (__v32hf) __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmul_pch (__m512h __A, __mmask16 __B, __m512h __C, __m512h __D)
+{
+ return (__m512h)
+ __builtin_ia32_vfmulcph512_mask_round ((__v32hf) __C,
+ (__v32hf) __D,
+ (__v32hf) __A,
+ __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmul_pch (__mmask16 __A, __m512h __B, __m512h __C)
+{
+ return (__m512h)
+ __builtin_ia32_vfmulcph512_mask_round ((__v32hf) __B,
+ (__v32hf) __C,
+ _mm512_setzero_ph (),
+ __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fcmul_round_pch (__m512h __A, __m512h __B, const int __D)
+{
+ return (__m512h)
+ __builtin_ia32_vfcmulcph512_round ((__v32hf) __A,
+ (__v32hf) __B, __D);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fcmul_round_pch (__m512h __A, __mmask16 __B, __m512h __C,
+ __m512h __D, const int __E)
+{
+ return (__m512h)
+ __builtin_ia32_vfcmulcph512_mask_round ((__v32hf) __C,
+ (__v32hf) __D,
+ (__v32hf) __A,
+ __B, __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fcmul_round_pch (__mmask16 __A, __m512h __B,
+ __m512h __C, const int __E)
+{
+ return (__m512h)
+ __builtin_ia32_vfcmulcph512_mask_round ((__v32hf) __B,
+ (__v32hf) __C,
+ _mm512_setzero_ph (),
+ __A, __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmul_round_pch (__m512h __A, __m512h __B, const int __D)
+{
+ return (__m512h)
+ __builtin_ia32_vfmulcph512_round ((__v32hf) __A,
+ (__v32hf) __B,
+ __D);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmul_round_pch (__m512h __A, __mmask16 __B, __m512h __C,
+ __m512h __D, const int __E)
+{
+ return (__m512h)
+ __builtin_ia32_vfmulcph512_mask_round ((__v32hf) __C,
+ (__v32hf) __D,
+ (__v32hf) __A,
+ __B, __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmul_round_pch (__mmask16 __A, __m512h __B,
+ __m512h __C, const int __E)
+{
+ return (__m512h)
+ __builtin_ia32_vfmulcph512_mask_round ((__v32hf) __B,
+ (__v32hf) __C,
+ _mm512_setzero_ph (),
+ __A, __E);
+}
+
+#else
+#define _mm512_fcmul_round_pch(A, B, D) \
+ (__m512h) __builtin_ia32_vfcmulcph512_round ((A), (B), (D))
+
+#define _mm512_mask_fcmul_round_pch(A, B, C, D, E) \
+ (__m512h) __builtin_ia32_vfcmulcph512_mask_round ((C), (D), (A), (B), (E))
+
+#define _mm512_maskz_fcmul_round_pch(A, B, C, E) \
+ (__m512h) __builtin_ia32_vfcmulcph512_mask_round ((B), (C), \
+ (__v32hf) \
+ _mm512_setzero_ph (), \
+ (A), (E))
+
+#define _mm512_fmul_round_pch(A, B, D) \
+ (__m512h) __builtin_ia32_vfmulcph512_round ((A), (B), (D))
+
+#define _mm512_mask_fmul_round_pch(A, B, C, D, E) \
+ (__m512h) __builtin_ia32_vfmulcph512_mask_round ((C), (D), (A), (B), (E))
+
+#define _mm512_maskz_fmul_round_pch(A, B, C, E) \
+ (__m512h) __builtin_ia32_vfmulcph512_mask_round ((B), (C), \
+ (__v32hf) \
+ _mm512_setzero_ph (), \
+ (A), (E))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vf[,c]maddcsh. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fcmadd_sch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+#ifdef __AVX512VL__
+ return (__m128h) __builtin_ia32_movaps128_mask (
+ (__v4sf)
+ __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) __A,
+ (__v8hf) __C,
+ (__v8hf) __D, __B,
+ _MM_FROUND_CUR_DIRECTION),
+ (__v4sf) __A, __B);
+#else
+ return (__m128h) __builtin_ia32_blendvps ((__v4sf) __A,
+ (__v4sf)
+ __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) __A,
+ (__v8hf) __C,
+ (__v8hf) __D, __B,
+ _MM_FROUND_CUR_DIRECTION),
+ (__v4sf) _mm_set_ss ((float) ((int) __B << 31)));
+#endif
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fcmadd_sch (__m128h __A, __m128h __B, __m128h __C, __mmask8 __D)
+{
+ return (__m128h) _mm_move_ss ((__m128) __C,
+ (__m128)
+ __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C, __D,
+ _MM_FROUND_CUR_DIRECTION));
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fcmadd_sch (__mmask8 __A, __m128h __B, __m128h __C, __m128h __D)
+{
+ return (__m128h)
+ __builtin_ia32_vfcmaddcsh_maskz_round ((__v8hf) __B,
+ (__v8hf) __C,
+ (__v8hf) __D,
+ __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fcmadd_sch (__m128h __A, __m128h __B, __m128h __C)
+{
+ return (__m128h)
+ __builtin_ia32_vfcmaddcsh_round ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmadd_sch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+#ifdef __AVX512VL__
+ return (__m128h) __builtin_ia32_movaps128_mask (
+ (__v4sf)
+ __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) __A,
+ (__v8hf) __C,
+ (__v8hf) __D, __B,
+ _MM_FROUND_CUR_DIRECTION),
+ (__v4sf) __A, __B);
+#else
+ return (__m128h) __builtin_ia32_blendvps ((__v4sf) __A,
+ (__v4sf)
+ __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) __A,
+ (__v8hf) __C,
+ (__v8hf) __D, __B,
+ _MM_FROUND_CUR_DIRECTION),
+ (__v4sf) _mm_set_ss ((float) ((int) __B << 31)));
+#endif
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmadd_sch (__m128h __A, __m128h __B, __m128h __C, __mmask8 __D)
+{
+ return (__m128h) _mm_move_ss ((__m128) __C,
+ (__m128)
+ __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C, __D,
+ _MM_FROUND_CUR_DIRECTION));
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmadd_sch (__mmask8 __A, __m128h __B, __m128h __C, __m128h __D)
+{
+ return (__m128h)
+ __builtin_ia32_vfmaddcsh_maskz_round ((__v8hf) __B,
+ (__v8hf) __C,
+ (__v8hf) __D,
+ __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmadd_sch (__m128h __A, __m128h __B, __m128h __C)
+{
+ return (__m128h)
+ __builtin_ia32_vfmaddcsh_round ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
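When AVX512VL is unavailable, the masked scalar variants above fall back on blendvps, which selects on the sign bit of each float lane; shifting mask bit 0 into bit 31, as _mm_set_ss ((float) ((int) __B << 31)) does, turns the predicate into exactly that selector. A scalar sketch of the trick (hedged; it uses an unsigned shift to sidestep the signed-overflow question):

    #include <stdint.h>

    /* Build a float whose sign bit equals bit 0 of the mask, i.e. the
       lane-0 selector that blendvps consumes.  */
    static inline float
    mask_to_selector (uint8_t mask)
    {
      union { uint32_t i; float f; } u;
      u.i = (uint32_t) (mask & 1u) << 31;
      return u.f;    /* negative (sign bit set) iff the mask bit is set */
    }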
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fcmadd_round_sch (__m128h __A, __mmask8 __B, __m128h __C,
+ __m128h __D, const int __E)
+{
+#ifdef __AVX512VL__
+ return (__m128h) __builtin_ia32_movaps128_mask (
+ (__v4sf)
+ __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) __A,
+ (__v8hf) __C,
+ (__v8hf) __D,
+ __B, __E),
+ (__v4sf) __A, __B);
+#else
+ return (__m128h) __builtin_ia32_blendvps ((__v4sf) __A,
+ (__v4sf)
+ __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) __A,
+ (__v8hf) __C,
+ (__v8hf) __D,
+ __B, __E),
+ (__v4sf) _mm_set_ss ((float) ((int) __B << 31)));
+#endif
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fcmadd_round_sch (__m128h __A, __m128h __B, __m128h __C,
+ __mmask8 __D, const int __E)
+{
+ return (__m128h) _mm_move_ss ((__m128) __C,
+ (__m128)
+ __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C,
+ __D, __E));
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fcmadd_round_sch (__mmask8 __A, __m128h __B, __m128h __C,
+ __m128h __D, const int __E)
+{
+ return (__m128h)
+ __builtin_ia32_vfcmaddcsh_maskz_round ((__v8hf) __B,
+ (__v8hf) __C,
+ (__v8hf) __D,
+ __A, __E);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fcmadd_round_sch (__m128h __A, __m128h __B, __m128h __C, const int __D)
+{
+ return (__m128h)
+ __builtin_ia32_vfcmaddcsh_round ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C,
+ __D);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmadd_round_sch (__m128h __A, __mmask8 __B, __m128h __C,
+ __m128h __D, const int __E)
+{
+#ifdef __AVX512VL__
+ return (__m128h) __builtin_ia32_movaps128_mask (
+ (__v4sf)
+ __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) __A,
+ (__v8hf) __C,
+ (__v8hf) __D,
+ __B, __E),
+ (__v4sf) __A, __B);
+#else
+ return (__m128h) __builtin_ia32_blendvps ((__v4sf) __A,
+ (__v4sf)
+ __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) __A,
+ (__v8hf) __C,
+ (__v8hf) __D,
+ __B, __E),
+ (__v4sf) _mm_set_ss ((float) ((int) __B << 31)));
+#endif
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmadd_round_sch (__m128h __A, __m128h __B, __m128h __C,
+ __mmask8 __D, const int __E)
+{
+ return (__m128h) _mm_move_ss ((__m128) __C,
+ (__m128)
+ __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C,
+ __D, __E));
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmadd_round_sch (__mmask8 __A, __m128h __B, __m128h __C,
+ __m128h __D, const int __E)
+{
+ return (__m128h)
+ __builtin_ia32_vfmaddcsh_maskz_round ((__v8hf) __B,
+ (__v8hf) __C,
+ (__v8hf) __D,
+ __A, __E);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmadd_round_sch (__m128h __A, __m128h __B, __m128h __C, const int __D)
+{
+ return (__m128h)
+ __builtin_ia32_vfmaddcsh_round ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C,
+ __D);
+}
+#else
+#ifdef __AVX512VL__
+#define _mm_mask_fcmadd_round_sch(A, B, C, D, E) \
+ ((__m128h) __builtin_ia32_movaps128_mask ( \
+ (__v4sf) \
+ __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) (A), \
+ (__v8hf) (C), \
+ (__v8hf) (D), \
+ (B), (E)), \
+ (__v4sf) (A), (B)))
+
+#else
+#define _mm_mask_fcmadd_round_sch(A, B, C, D, E) \
+ ((__m128h) __builtin_ia32_blendvps ((__v4sf) (A), \
+ (__v4sf) \
+ __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) (A), \
+ (__v8hf) (C), \
+ (__v8hf) (D), \
+ (B), (E)), \
+ (__v4sf) _mm_set_ss ((float) ((int) (B) << 31))))
+#endif
+
+#define _mm_mask3_fcmadd_round_sch(A, B, C, D, E) \
+ ((__m128h) _mm_move_ss ((__m128) (C), \
+ (__m128) \
+ __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) (A), \
+ (__v8hf) (B), \
+ (__v8hf) (C), \
+ (D), (E))))
+
+#define _mm_maskz_fcmadd_round_sch(A, B, C, D, E) \
+ __builtin_ia32_vfcmaddcsh_maskz_round ((B), (C), (D), (A), (E))
+
+#define _mm_fcmadd_round_sch(A, B, C, D) \
+ __builtin_ia32_vfcmaddcsh_round ((A), (B), (C), (D))
+
+#ifdef __AVX512VL__
+#define _mm_mask_fmadd_round_sch(A, B, C, D, E) \
+ ((__m128h) __builtin_ia32_movaps128_mask ( \
+ (__v4sf) \
+ __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) (A), \
+ (__v8hf) (C), \
+ (__v8hf) (D), \
+ (B), (E)), \
+ (__v4sf) (A), (B)))
+
+#else
+#define _mm_mask_fmadd_round_sch(A, B, C, D, E) \
+ ((__m128h) __builtin_ia32_blendvps ((__v4sf) (A), \
+ (__v4sf) \
+ __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) (A), \
+ (__v8hf) (C), \
+ (__v8hf) (D), \
+ (B), (E)), \
+ (__v4sf) _mm_set_ss ((float) ((int) (B) << 31))))
+#endif
+
+#define _mm_mask3_fmadd_round_sch(A, B, C, D, E) \
+ ((__m128h) _mm_move_ss ((__m128) (C), \
+ (__m128) \
+ __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) (A), \
+ (__v8hf) (B), \
+ (__v8hf) (C), \
+ (D), (E))))
+
+#define _mm_maskz_fmadd_round_sch(A, B, C, D, E) \
+ __builtin_ia32_vfmaddcsh_maskz_round ((B), (C), (D), (A), (E))
+
+#define _mm_fmadd_round_sch(A, B, C, D) \
+ __builtin_ia32_vfmaddcsh_round ((A), (B), (C), (D))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vf[,c]mulcsh. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fcmul_sch (__m128h __A, __m128h __B)
+{
+ return (__m128h)
+ __builtin_ia32_vfcmulcsh_round ((__v8hf) __A,
+ (__v8hf) __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fcmul_sch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+ return (__m128h)
+ __builtin_ia32_vfcmulcsh_mask_round ((__v8hf) __C,
+ (__v8hf) __D,
+ (__v8hf) __A,
+ __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fcmul_sch (__mmask8 __A, __m128h __B, __m128h __C)
+{
+ return (__m128h)
+ __builtin_ia32_vfcmulcsh_mask_round ((__v8hf) __B,
+ (__v8hf) __C,
+ _mm_setzero_ph (),
+ __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmul_sch (__m128h __A, __m128h __B)
+{
+ return (__m128h)
+ __builtin_ia32_vfmulcsh_round ((__v8hf) __A,
+ (__v8hf) __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmul_sch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+ return (__m128h)
+ __builtin_ia32_vfmulcsh_mask_round ((__v8hf) __C,
+ (__v8hf) __D,
+ (__v8hf) __A,
+ __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmul_sch (__mmask8 __A, __m128h __B, __m128h __C)
+{
+ return (__m128h)
+ __builtin_ia32_vfmulcsh_mask_round ((__v8hf) __B,
+ (__v8hf) __C,
+ _mm_setzero_ph (),
+ __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fcmul_round_sch (__m128h __A, __m128h __B, const int __D)
+{
+ return (__m128h)
+ __builtin_ia32_vfcmulcsh_round ((__v8hf) __A,
+ (__v8hf) __B,
+ __D);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fcmul_round_sch (__m128h __A, __mmask8 __B, __m128h __C,
+ __m128h __D, const int __E)
+{
+ return (__m128h)
+ __builtin_ia32_vfcmulcsh_mask_round ((__v8hf) __C,
+ (__v8hf) __D,
+ (__v8hf) __A,
+ __B, __E);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fcmul_round_sch (__mmask8 __A, __m128h __B, __m128h __C,
+ const int __E)
+{
+ return (__m128h)
+ __builtin_ia32_vfcmulcsh_mask_round ((__v8hf) __B,
+ (__v8hf) __C,
+ _mm_setzero_ph (),
+ __A, __E);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmul_round_sch (__m128h __A, __m128h __B, const int __D)
+{
+ return (__m128h)
+ __builtin_ia32_vfmulcsh_round ((__v8hf) __A,
+ (__v8hf) __B, __D);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmul_round_sch (__m128h __A, __mmask8 __B, __m128h __C,
+ __m128h __D, const int __E)
+{
+ return (__m128h)
+ __builtin_ia32_vfmulcsh_mask_round ((__v8hf) __C,
+ (__v8hf) __D,
+ (__v8hf) __A,
+ __B, __E);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmul_round_sch (__mmask8 __A, __m128h __B, __m128h __C, const int __E)
+{
+ return (__m128h)
+ __builtin_ia32_vfmulcsh_mask_round ((__v8hf) __B,
+ (__v8hf) __C,
+ _mm_setzero_ph (),
+ __A, __E);
+}
+
+#else
+#define _mm_fcmul_round_sch(__A, __B, __D) \
+ (__m128h) __builtin_ia32_vfcmulcsh_round ((__v8hf) __A, \
+ (__v8hf) __B, __D)
+
+#define _mm_mask_fcmul_round_sch(__A, __B, __C, __D, __E) \
+ (__m128h) __builtin_ia32_vfcmulcsh_mask_round ((__v8hf) __C, \
+ (__v8hf) __D, \
+ (__v8hf) __A, \
+ __B, __E)
+
+#define _mm_maskz_fcmul_round_sch(__A, __B, __C, __E) \
+ (__m128h) __builtin_ia32_vfcmulcsh_mask_round ((__v8hf) __B, \
+ (__v8hf) __C, \
+ _mm_setzero_ph (), \
+ __A, __E)
+
+#define _mm_fmul_round_sch(__A, __B, __D) \
+ (__m128h) __builtin_ia32_vfmulcsh_round ((__v8hf) __A, \
+ (__v8hf) __B, __D)
+
+#define _mm_mask_fmul_round_sch(__A, __B, __C, __D, __E) \
+ (__m128h) __builtin_ia32_vfmulcsh_mask_round ((__v8hf) __C, \
+ (__v8hf) __D, \
+ (__v8hf) __A, \
+ __B, __E)
+
+#define _mm_maskz_fmul_round_sch(__A, __B, __C, __E) \
+ (__m128h) __builtin_ia32_vfmulcsh_mask_round ((__v8hf) __B, \
+ (__v8hf) __C, \
+ _mm_setzero_ph (), \
+ __A, __E)
+
+#endif /* __OPTIMIZE__ */
+
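+/* Usage sketch: a scalar complex _Float16 value occupies the two low
+   half lanes of an __m128h.  With a = x0 + x1*i and b = y0 + y1*i,
+   _mm_fmul_sch computes a*b and _mm_fcmul_sch computes a*conj(b) in
+   the low pair, copying the upper six lanes from the first operand.
+   Illustrative helper only; assumes _mm_setr_ph from this header and
+   -mavx512fp16.  */
+static __inline __m128h
+__fmul_sch_sketch (_Float16 __x0, _Float16 __x1,
+		   _Float16 __y0, _Float16 __y1)
+{
+  __m128h __a = _mm_setr_ph (__x0, __x1, 0, 0, 0, 0, 0, 0);
+  __m128h __b = _mm_setr_ph (__y0, __y1, 0, 0, 0, 0, 0, 0);
+  /* Low pair: (x0*y0 - x1*y1, x0*y1 + x1*y0).  */
+  return _mm_fmul_sch (__a, __b);
+}
+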
+#define _MM512_REDUCE_OP(op) \
+ __m256h __T1 = (__m256h) _mm512_extractf64x4_pd ((__m512d) __A, 0); \
+ __m256h __T2 = (__m256h) _mm512_extractf64x4_pd ((__m512d) __A, 1); \
+ __m256h __T3 = (__T1 op __T2); \
+ __m128h __T4 = (__m128h) _mm256_extractf128_pd ((__m256d) __T3, 0); \
+ __m128h __T5 = (__m128h) _mm256_extractf128_pd ((__m256d) __T3, 1); \
+ __m128h __T6 = (__T4 op __T5); \
+ __m128h __T7 = (__m128h) __builtin_shuffle ((__m128h)__T6, \
+ (__v8hi) { 4, 5, 6, 7, 0, 1, 2, 3 }); \
+ __m128h __T8 = (__T6 op __T7); \
+ __m128h __T9 = (__m128h) __builtin_shuffle ((__m128h)__T8, \
+ (__v8hi) { 2, 3, 0, 1, 4, 5, 6, 7 }); \
+ __m128h __T10 = __T8 op __T9; \
+ return __T10[0] op __T10[1]
+
+/* Horizontal reductions across all lanes of a vector.  */
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_add_ph (__m512h __A)
+{
+ _MM512_REDUCE_OP (+);
+}
+
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_mul_ph (__m512h __A)
+{
+ _MM512_REDUCE_OP (*);
+}
+
+#undef _MM512_REDUCE_OP
+
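+/* Scalar sketch of _mm512_reduce_add_ph above: it returns the sum of
+   all 32 lanes.  The macro folds pairwise in log2 depth, so the
+   association (and hence rounding) differs from this sequential loop.
+   Illustrative helper only.  */
+static __inline _Float16
+__reduce_add_ph_sketch (__m512h __A)
+{
+  _Float16 __sum = 0;
+  for (int __i = 0; __i < 32; __i++)
+    __sum += __A[__i];
+  return __sum;
+}
+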
+#ifdef __AVX512VL__
+
+#define _MM512_REDUCE_OP(op) \
+ __m256h __T1 = (__m256h) _mm512_extractf64x4_pd ((__m512d) __A, 0); \
+ __m256h __T2 = (__m256h) _mm512_extractf64x4_pd ((__m512d) __A, 1); \
+ __m256h __T3 = __builtin_ia32_##op##ph256_mask (__T1, __T2, \
+ _mm256_setzero_ph (), (__mmask16) -1); \
+ __m128h __T4 = (__m128h) _mm256_extractf128_pd ((__m256d) __T3, 0); \
+ __m128h __T5 = (__m128h) _mm256_extractf128_pd ((__m256d) __T3, 1); \
+ __m128h __T6 = __builtin_ia32_##op##ph128_mask \
+ (__T4, __T5, _mm_setzero_ph (),(__mmask8) -1); \
+ __m128h __T7 = (__m128h) __builtin_shuffle ((__m128h)__T6, \
+ (__v8hi) { 2, 3, 0, 1, 6, 7, 4, 5 }); \
+ __m128h __T8 = (__m128h) __builtin_ia32_##op##ph128_mask \
+ (__T6, __T7, _mm_setzero_ph (),(__mmask8) -1); \
+ __m128h __T9 = (__m128h) __builtin_shuffle ((__m128h)__T8, \
+ (__v8hi) { 4, 5 }); \
+ __m128h __T10 = __builtin_ia32_##op##ph128_mask \
+ (__T8, __T9, _mm_setzero_ph (),(__mmask8) -1); \
+ __m128h __T11 = (__m128h) __builtin_shuffle (__T10, \
+ (__v8hi) { 1, 0 }); \
+ __m128h __T12 = __builtin_ia32_##op##ph128_mask \
+ (__T10, __T11, _mm_setzero_ph (),(__mmask8) -1); \
+ return __T12[0]
+
+#else
+
+#define _MM512_REDUCE_OP(op) \
+ __m512h __T1 = (__m512h) __builtin_shuffle ((__m512d) __A, \
+ (__v8di) { 4, 5, 6, 7, 0, 0, 0, 0 }); \
+ __m512h __T2 = _mm512_##op##_ph (__A, __T1); \
+ __m512h __T3 = (__m512h) __builtin_shuffle ((__m512d) __T2, \
+ (__v8di) { 2, 3, 0, 0, 0, 0, 0, 0 }); \
+ __m512h __T4 = _mm512_##op##_ph (__T2, __T3); \
+ __m512h __T5 = (__m512h) __builtin_shuffle ((__m512d) __T4, \
+ (__v8di) { 1, 0, 0, 0, 0, 0, 0, 0 }); \
+ __m512h __T6 = _mm512_##op##_ph (__T4, __T5); \
+ __m512h __T7 = (__m512h) __builtin_shuffle ((__m512) __T6, \
+ (__v16si) { 1, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0 }); \
+ __m512h __T8 = _mm512_##op##_ph (__T6, __T7); \
+ __m512h __T9 = (__m512h) __builtin_shuffle (__T8, \
+ (__v32hi) { 1, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0 }); \
+ __m512h __T10 = _mm512_##op##_ph (__T8, __T9); \
+ return __T10[0]
+#endif
+
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_min_ph (__m512h __A)
+{
+ _MM512_REDUCE_OP (min);
+}
+
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_max_ph (__m512h __A)
+{
+ _MM512_REDUCE_OP (max);
+}
+
+#undef _MM512_REDUCE_OP
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_blend_ph (__mmask32 __U, __m512h __A, __m512h __W)
+{
+ return (__m512h) __builtin_ia32_movdquhi512_mask ((__v32hi) __W,
+ (__v32hi) __A,
+ (__mmask32) __U);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutex2var_ph (__m512h __A, __m512i __I, __m512h __B)
+{
+ return (__m512h) __builtin_ia32_vpermi2varhi512_mask ((__v32hi) __A,
+ (__v32hi) __I,
+ (__v32hi) __B,
+ (__mmask32)-1);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutexvar_ph (__m512i __A, __m512h __B)
+{
+ return (__m512h) __builtin_ia32_permvarhi512_mask ((__v32hi) __B,
+ (__v32hi) __A,
+ (__v32hi)
+ (_mm512_setzero_ph ()),
+ (__mmask32)-1);
+}
+
#ifdef __DISABLE_AVX512FP16__
#undef __DISABLE_AVX512FP16__
#pragma GCC pop_options
diff --git a/gcc/config/i386/avx512fp16vlintrin.h b/gcc/config/i386/avx512fp16vlintrin.h
index 1292c02..3d3de96 100644
--- a/gcc/config/i386/avx512fp16vlintrin.h
+++ b/gcc/config/i386/avx512fp16vlintrin.h
@@ -151,6 +151,59 @@ _mm256_zextph128_ph256 (__m128h __A)
(__m128) __A, 0);
}
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_conj_pch (__m256h __A)
+{
+ return (__m256h) _mm256_xor_epi32 ((__m256i) __A, _mm256_set1_epi32 (1<<31));
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_conj_pch (__m256h __W, __mmask8 __U, __m256h __A)
+{
+ return (__m256h) __builtin_ia32_movaps256_mask ((__v8sf)
+ _mm256_conj_pch (__A),
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_conj_pch (__mmask8 __U, __m256h __A)
+{
+ return (__m256h) __builtin_ia32_movaps256_mask ((__v8sf)
+ _mm256_conj_pch (__A),
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_conj_pch (__m128h __A)
+{
+ return (__m128h) _mm_xor_epi32 ((__m128i) __A, _mm_set1_epi32 (1<<31));
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_conj_pch (__m128h __W, __mmask8 __U, __m128h __A)
+{
+ return (__m128h) __builtin_ia32_movaps128_mask ((__v4sf) _mm_conj_pch (__A),
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_conj_pch (__mmask8 __U, __m128h __A)
+{
+ return (__m128h) __builtin_ia32_movaps128_mask ((__v4sf) _mm_conj_pch (__A),
+ (__v4sf) _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
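+/* Usage sketch: each 32-bit lane of a __m128h holds one complex value
+   as (real, imag) _Float16 halves, so XOR-ing bit 31 negates the
+   imaginary half of every lane at once.  Illustrative helper only.  */
+static __inline __m128h
+__conj_pch_sketch (__m128h __z)
+{
+  /* (r, i) -> (r, -i) in each of the four complex lanes.  */
+  return _mm_conj_pch (__z);
+}
+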
/* Intrinsics v[add,sub,mul,div]ph. */
extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
@@ -425,6 +478,22 @@ _mm256_maskz_min_ph (__mmask16 __A, __m256h __B, __m256h __C)
_mm256_setzero_ph (), __A);
}
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_ph (__m128h __A)
+{
+ return (__m128h) _mm_and_si128 ( _mm_set1_epi32 (0x7FFF7FFF),
+ (__m128i) __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_abs_ph (__m256h __A)
+{
+ return (__m256h) _mm256_and_si256 ( _mm256_set1_epi32 (0x7FFF7FFF),
+ (__m256i) __A);
+}
+
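+/* Usage sketch: the constant 0x7FFF7FFF clears bit 15 of both 16-bit
+   halves in every 32-bit chunk, i.e. the _Float16 sign bit, so each
+   lane becomes |x| (including -0.0 -> +0.0; NaNs keep their payload).
+   Illustrative helper only.  */
+static __inline __m256h
+__abs_ph_sketch (__m256h __x)
+{
+  return _mm256_abs_ph (__x);
+}
+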
/* vcmpph */
 #ifdef __OPTIMIZE__
extern __inline __mmask8
@@ -2815,6 +2884,437 @@ _mm_maskz_fnmsub_ph (__mmask8 __U, __m128h __A, __m128h __B,
__U);
}
+/* Intrinsics vf[,c]maddcph. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmadd_pch (__m128h __A, __m128h __B, __m128h __C)
+{
+ return (__m128h) __builtin_ia32_vfmaddcph128 ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmadd_pch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+ return (__m128h) __builtin_ia32_movaps128_mask
+ ((__v4sf)
+ __builtin_ia32_vfmaddcph128_mask ((__v8hf) __A,
+ (__v8hf) __C,
+ (__v8hf) __D, __B),
+ (__v4sf) __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmadd_pch (__m128h __A, __m128h __B, __m128h __C, __mmask8 __D)
+{
+ return (__m128h) __builtin_ia32_vfmaddcph128_mask ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C, __D);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmadd_pch (__mmask8 __A, __m128h __B, __m128h __C, __m128h __D)
+{
+ return (__m128h) __builtin_ia32_vfmaddcph128_maskz ((__v8hf) __B,
+ (__v8hf) __C,
+ (__v8hf) __D, __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmadd_pch (__m256h __A, __m256h __B, __m256h __C)
+{
+ return (__m256h) __builtin_ia32_vfmaddcph256 ((__v16hf) __A,
+ (__v16hf) __B,
+ (__v16hf) __C);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmadd_pch (__m256h __A, __mmask8 __B, __m256h __C, __m256h __D)
+{
+ return (__m256h) __builtin_ia32_movaps256_mask
+ ((__v8sf)
+ __builtin_ia32_vfmaddcph256_mask ((__v16hf) __A,
+ (__v16hf) __C,
+ (__v16hf) __D, __B),
+ (__v8sf) __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fmadd_pch (__m256h __A, __m256h __B, __m256h __C, __mmask8 __D)
+{
+ return (__m256h) __builtin_ia32_vfmaddcph256_mask ((__v16hf) __A,
+ (__v16hf) __B,
+ (__v16hf) __C, __D);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmadd_pch (__mmask8 __A, __m256h __B, __m256h __C, __m256h __D)
+{
+ return (__m256h)__builtin_ia32_vfmaddcph256_maskz ((__v16hf) __B,
+ (__v16hf) __C,
+ (__v16hf) __D, __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fcmadd_pch (__m128h __A, __m128h __B, __m128h __C)
+{
+ return (__m128h) __builtin_ia32_vfcmaddcph128 ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fcmadd_pch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+ return (__m128h) __builtin_ia32_movaps128_mask
+ ((__v4sf)
+ __builtin_ia32_vfcmaddcph128_mask ((__v8hf) __A,
+ (__v8hf) __C,
+ (__v8hf) __D, __B),
+ (__v4sf) __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fcmadd_pch (__m128h __A, __m128h __B, __m128h __C, __mmask8 __D)
+{
+ return (__m128h) __builtin_ia32_vfcmaddcph128_mask ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C, __D);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fcmadd_pch (__mmask8 __A, __m128h __B, __m128h __C, __m128h __D)
+{
+ return (__m128h)__builtin_ia32_vfcmaddcph128_maskz ((__v8hf) __B,
+ (__v8hf) __C,
+ (__v8hf) __D, __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fcmadd_pch (__m256h __A, __m256h __B, __m256h __C)
+{
+ return (__m256h) __builtin_ia32_vfcmaddcph256 ((__v16hf) __A,
+ (__v16hf) __B,
+ (__v16hf) __C);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fcmadd_pch (__m256h __A, __mmask8 __B, __m256h __C, __m256h __D)
+{
+ return (__m256h) __builtin_ia32_movaps256_mask
+ ((__v8sf)
+ __builtin_ia32_vfcmaddcph256_mask ((__v16hf) __A,
+ (__v16hf) __C,
+ (__v16hf) __D, __B),
+ (__v8sf) __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fcmadd_pch (__m256h __A, __m256h __B, __m256h __C, __mmask8 __D)
+{
+ return (__m256h) __builtin_ia32_vfcmaddcph256_mask ((__v16hf) __A,
+ (__v16hf) __B,
+ (__v16hf) __C, __D);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fcmadd_pch (__mmask8 __A, __m256h __B, __m256h __C, __m256h __D)
+{
+ return (__m256h) __builtin_ia32_vfcmaddcph256_maskz ((__v16hf) __B,
+ (__v16hf) __C,
+ (__v16hf) __D, __A);
+}
+
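+/* Usage sketch: lanes pair up as (real, imag) _Float16 halves.
+   _mm_fmadd_pch (a, b, c) computes a*b + c per complex lane, i.e.
+   real: ar*br - ai*bi + cr and imag: ar*bi + ai*br + ci, while
+   _mm_fcmadd_pch conjugates the second operand (a*conj(b) + c).
+   Illustrative helper only.  */
+static __inline __m128h
+__cfma_pch_sketch (__m128h __a, __m128h __b, __m128h __c)
+{
+  return _mm_fmadd_pch (__a, __b, __c);
+}
+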
+/* Intrinsics vf[,c]mulcph. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmul_pch (__m128h __A, __m128h __B)
+{
+ return (__m128h) __builtin_ia32_vfmulcph128 ((__v8hf) __A, (__v8hf) __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmul_pch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+ return (__m128h) __builtin_ia32_vfmulcph128_mask ((__v8hf) __C,
+ (__v8hf) __D,
+ (__v8hf) __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmul_pch (__mmask8 __A, __m128h __B, __m128h __C)
+{
+ return (__m128h) __builtin_ia32_vfmulcph128_mask ((__v8hf) __B,
+ (__v8hf) __C,
+ _mm_setzero_ph (),
+ __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmul_pch (__m256h __A, __m256h __B)
+{
+ return (__m256h) __builtin_ia32_vfmulcph256 ((__v16hf) __A,
+ (__v16hf) __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmul_pch (__m256h __A, __mmask8 __B, __m256h __C, __m256h __D)
+{
+ return (__m256h) __builtin_ia32_vfmulcph256_mask ((__v16hf) __C,
+ (__v16hf) __D,
+ (__v16hf) __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmul_pch (__mmask8 __A, __m256h __B, __m256h __C)
+{
+ return (__m256h) __builtin_ia32_vfmulcph256_mask ((__v16hf) __B,
+ (__v16hf) __C,
+ _mm256_setzero_ph (),
+ __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fcmul_pch (__m128h __A, __m128h __B)
+{
+ return (__m128h) __builtin_ia32_vfcmulcph128 ((__v8hf) __A,
+ (__v8hf) __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fcmul_pch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+ return (__m128h) __builtin_ia32_vfcmulcph128_mask ((__v8hf) __C,
+ (__v8hf) __D,
+ (__v8hf) __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fcmul_pch (__mmask8 __A, __m128h __B, __m128h __C)
+{
+ return (__m128h) __builtin_ia32_vfcmulcph128_mask ((__v8hf) __B,
+ (__v8hf) __C,
+ _mm_setzero_ph (),
+ __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fcmul_pch (__m256h __A, __m256h __B)
+{
+ return (__m256h) __builtin_ia32_vfcmulcph256 ((__v16hf) __A, (__v16hf) __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fcmul_pch (__m256h __A, __mmask8 __B, __m256h __C, __m256h __D)
+{
+ return (__m256h) __builtin_ia32_vfcmulcph256_mask ((__v16hf) __C,
+ (__v16hf) __D,
+ (__v16hf) __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fcmul_pch (__mmask8 __A, __m256h __B, __m256h __C)
+{
+ return (__m256h) __builtin_ia32_vfcmulcph256_mask ((__v16hf) __B,
+ (__v16hf) __C,
+ _mm256_setzero_ph (),
+ __A);
+}
+
+#define _MM256_REDUCE_OP(op) \
+ __m128h __T1 = (__m128h) _mm256_extractf128_pd ((__m256d) __A, 0); \
+ __m128h __T2 = (__m128h) _mm256_extractf128_pd ((__m256d) __A, 1); \
+ __m128h __T3 = (__T1 op __T2); \
+ __m128h __T4 = (__m128h) __builtin_shuffle (__T3, \
+ (__v8hi) { 4, 5, 6, 7, 0, 1, 2, 3 }); \
+ __m128h __T5 = (__T3) op (__T4); \
+ __m128h __T6 = (__m128h) __builtin_shuffle (__T5, \
+ (__v8hi) { 2, 3, 0, 1, 4, 5, 6, 7 }); \
+ __m128h __T7 = __T5 op __T6; \
+ return __T7[0] op __T7[1]
+
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_reduce_add_ph (__m256h __A)
+{
+ _MM256_REDUCE_OP (+);
+}
+
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_reduce_mul_ph (__m256h __A)
+{
+ _MM256_REDUCE_OP (*);
+}
+
+#undef _MM256_REDUCE_OP
+#define _MM256_REDUCE_OP(op) \
+ __m128h __T1 = (__m128h) _mm256_extractf128_pd ((__m256d) __A, 0); \
+ __m128h __T2 = (__m128h) _mm256_extractf128_pd ((__m256d) __A, 1); \
+ __m128h __T3 = _mm_##op (__T1, __T2); \
+ __m128h __T4 = (__m128h) __builtin_shuffle (__T3, \
+ (__v8hi) { 2, 3, 0, 1, 6, 7, 4, 5 }); \
+ __m128h __T5 = _mm_##op (__T3, __T4); \
+ __m128h __T6 = (__m128h) __builtin_shuffle (__T5, (__v8hi) { 4, 5 }); \
+ __m128h __T7 = _mm_##op (__T5, __T6); \
+ __m128h __T8 = (__m128h) __builtin_shuffle (__T7, (__v8hi) { 1, 0 }); \
+ __m128h __T9 = _mm_##op (__T7, __T8); \
+ return __T9[0]
+
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_reduce_min_ph (__m256h __A)
+{
+ _MM256_REDUCE_OP (min_ph);
+}
+
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_reduce_max_ph (__m256h __A)
+{
+ _MM256_REDUCE_OP (max_ph);
+}
+
+#define _MM_REDUCE_OP(op) \
+ __m128h __T1 = (__m128h) __builtin_shuffle (__A, \
+ (__v8hi) { 4, 5, 6, 7, 0, 1, 2, 3 }); \
+ __m128h __T2 = (__A) op (__T1); \
+ __m128h __T3 = (__m128h) __builtin_shuffle (__T2, \
+ (__v8hi){ 2, 3, 0, 1, 4, 5, 6, 7 }); \
+ __m128h __T4 = __T2 op __T3; \
+ return __T4[0] op __T4[1]
+
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_add_ph (__m128h __A)
+{
+ _MM_REDUCE_OP (+);
+}
+
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_mul_ph (__m128h __A)
+{
+ _MM_REDUCE_OP (*);
+}
+
+#undef _MM_REDUCE_OP
+#define _MM_REDUCE_OP(op) \
+ __m128h __T1 = (__m128h) __builtin_shuffle (__A, \
+ (__v8hi) { 2, 3, 0, 1, 6, 7, 4, 5 }); \
+ __m128h __T2 = _mm_##op (__A, __T1); \
+ __m128h __T3 = (__m128h) __builtin_shuffle (__T2, (__v8hi){ 4, 5 }); \
+ __m128h __T4 = _mm_##op (__T2, __T3); \
+ __m128h __T5 = (__m128h) __builtin_shuffle (__T4, (__v8hi){ 1, 0 }); \
+ __m128h __T6 = _mm_##op (__T4, __T5); \
+ return __T6[0]
+
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_min_ph (__m128h __A)
+{
+ _MM_REDUCE_OP (min_ph);
+}
+
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_max_ph (__m128h __A)
+{
+ _MM_REDUCE_OP (max_ph);
+}
+
+#undef _MM256_REDUCE_OP
+#undef _MM_REDUCE_OP
+
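+/* Usage sketch: the min/max reductions fold with vminph/vmaxph, which
+   (like the other SSE/AVX min/max instructions) return the second
+   operand when an input is NaN.  Combined with _mm_abs_ph this yields
+   the largest magnitude across the eight lanes.  Illustrative only.  */
+static __inline _Float16
+__reduce_max_abs_sketch (__m128h __x)
+{
+  return _mm_reduce_max_ph (_mm_abs_ph (__x));
+}
+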
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_blend_ph (__mmask16 __U, __m256h __A, __m256h __W)
+{
+ return (__m256h) __builtin_ia32_movdquhi256_mask ((__v16hi) __W,
+ (__v16hi) __A,
+ (__mmask16) __U);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutex2var_ph (__m256h __A, __m256i __I, __m256h __B)
+{
+ return (__m256h) __builtin_ia32_vpermi2varhi256_mask ((__v16hi) __A,
+ (__v16hi) __I,
+ (__v16hi) __B,
+ (__mmask16)-1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutexvar_ph (__m256i __A, __m256h __B)
+{
+ return (__m256h) __builtin_ia32_permvarhi256_mask ((__v16hi) __B,
+ (__v16hi) __A,
+ (__v16hi)
+ (_mm256_setzero_ph ()),
+ (__mmask16)-1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_blend_ph (__mmask8 __U, __m128h __A, __m128h __W)
+{
+ return (__m128h) __builtin_ia32_movdquhi128_mask ((__v8hi) __W,
+ (__v8hi) __A,
+ (__mmask8) __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permutex2var_ph (__m128h __A, __m128i __I, __m128h __B)
+{
+ return (__m128h) __builtin_ia32_vpermi2varhi128_mask ((__v8hi) __A,
+ (__v8hi) __I,
+ (__v8hi) __B,
+ (__mmask8)-1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permutexvar_ph (__m128i __A, __m128h __B)
+{
+ return (__m128h) __builtin_ia32_permvarhi128_mask ((__v8hi) __B,
+ (__v8hi) __A,
+ (__v8hi)
+ (_mm_setzero_ph ()),
+ (__mmask8)-1);
+}
+
#ifdef __DISABLE_AVX512FP16VL__
#undef __DISABLE_AVX512FP16VL__
#pragma GCC pop_options
diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def
index 5eae4d0..4c355c5 100644
--- a/gcc/config/i386/i386-builtin-types.def
+++ b/gcc/config/i386/i386-builtin-types.def
@@ -1348,6 +1348,7 @@ DEF_FUNCTION_TYPE (V8DI, V8HF, V8DI, UQI, INT)
DEF_FUNCTION_TYPE (V8DF, V8HF, V8DF, UQI, INT)
DEF_FUNCTION_TYPE (V8HF, V8DI, V8HF, UQI, INT)
DEF_FUNCTION_TYPE (V8HF, V8DF, V8HF, UQI, INT)
+DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, V8HF)
DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, V8HF, UQI, INT)
DEF_FUNCTION_TYPE (V8HF, V2DF, V8HF, V8HF, UQI, INT)
DEF_FUNCTION_TYPE (V8HF, V4SF, V8HF, V8HF, UQI, INT)
@@ -1358,12 +1359,14 @@ DEF_FUNCTION_TYPE (V16HF, V16HF, V16HF)
DEF_FUNCTION_TYPE (V16HI, V16HF, V16HI, UHI)
DEF_FUNCTION_TYPE (V16HF, V16HI, V16HF, UHI)
DEF_FUNCTION_TYPE (V16HF, V16HF, V16HF, UHI)
+DEF_FUNCTION_TYPE (V16HF, V16HF, V16HF, V16HF)
DEF_FUNCTION_TYPE (V16SI, V16HF, V16SI, UHI, INT)
DEF_FUNCTION_TYPE (V16SF, V16HF, V16SF, UHI, INT)
DEF_FUNCTION_TYPE (V16HF, V16HF, INT, V16HF, UHI)
DEF_FUNCTION_TYPE (UHI, V16HF, V16HF, INT, UHI)
DEF_FUNCTION_TYPE (V16HF, V16SI, V16HF, UHI, INT)
DEF_FUNCTION_TYPE (V16HF, V16SF, V16HF, UHI, INT)
+DEF_FUNCTION_TYPE (V16HF, V16HF, V16HF, V16HF, UQI)
DEF_FUNCTION_TYPE (V16HF, V16HF, V16HF, V16HF, UHI)
DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, USI)
DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, INT)
@@ -1371,7 +1374,9 @@ DEF_FUNCTION_TYPE (V32HI, V32HF, V32HI, USI, INT)
DEF_FUNCTION_TYPE (V32HF, V32HI, V32HF, USI, INT)
DEF_FUNCTION_TYPE (USI, V32HF, V32HF, INT, USI)
DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, USI, INT)
+DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, INT)
DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, USI)
DEF_FUNCTION_TYPE (USI, V32HF, V32HF, INT, USI, INT)
+DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, UHI, INT)
DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, USI, INT)
DEF_FUNCTION_TYPE (V32HF, V32HF, INT, V32HF, USI, INT)
diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index 5950d5e..302e1bc 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -2911,6 +2911,26 @@ BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_
BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fnmsub_v8hf_mask, "__builtin_ia32_vfnmsubph128_mask", IX86_BUILTIN_VFNMSUBPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fnmsub_v8hf_mask3, "__builtin_ia32_vfnmsubph128_mask3", IX86_BUILTIN_VFNMSUBPH128_MASK3, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fnmsub_v8hf_maskz, "__builtin_ia32_vfnmsubph128_maskz", IX86_BUILTIN_VFNMSUBPH128_MASKZ, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_fma_fmaddc_v8hf, "__builtin_ia32_vfmaddcph128", IX86_BUILTIN_VFMADDCPH_V8HF, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fmaddc_v8hf_mask, "__builtin_ia32_vfmaddcph128_mask", IX86_BUILTIN_VFMADDCPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fmaddc_v8hf_maskz, "__builtin_ia32_vfmaddcph128_maskz", IX86_BUILTIN_VFMADDCPH128_MASKZ, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_fma_fmaddc_v16hf, "__builtin_ia32_vfmaddcph256", IX86_BUILTIN_VFMADDCPH_V16HF, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_fmaddc_v16hf_mask, "__builtin_ia32_vfmaddcph256_mask", IX86_BUILTIN_VFMADDCPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_fmaddc_v16hf_maskz, "__builtin_ia32_vfmaddcph256_maskz", IX86_BUILTIN_VFMADDCPH256_MASKZ, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_fma_fcmaddc_v8hf, "__builtin_ia32_vfcmaddcph128", IX86_BUILTIN_VFCMADDCPH_V8HF, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmaddc_v8hf_mask, "__builtin_ia32_vfcmaddcph128_mask", IX86_BUILTIN_VFCMADDCPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmaddc_v8hf_maskz, "__builtin_ia32_vfcmaddcph128_maskz", IX86_BUILTIN_VFCMADDCPH128_MASKZ, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_fma_fcmaddc_v16hf, "__builtin_ia32_vfcmaddcph256", IX86_BUILTIN_VFCMADDCPH_V16HF, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_fcmaddc_v16hf_mask, "__builtin_ia32_vfcmaddcph256_mask", IX86_BUILTIN_VFCMADDCPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_fcmaddc_v16hf_maskz, "__builtin_ia32_vfcmaddcph256_maskz", IX86_BUILTIN_VFCMADDCPH256_MASKZ, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmulc_v8hf, "__builtin_ia32_vfcmulcph128", IX86_BUILTIN_VFCMULCPH_V8HF, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmulc_v8hf_mask, "__builtin_ia32_vfcmulcph128_mask", IX86_BUILTIN_VFCMULCPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_fcmulc_v16hf, "__builtin_ia32_vfcmulcph256", IX86_BUILTIN_VFCMULCPH_V16HF, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_fcmulc_v16hf_mask, "__builtin_ia32_vfcmulcph256_mask", IX86_BUILTIN_VFCMULCPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fmulc_v8hf, "__builtin_ia32_vfmulcph128", IX86_BUILTIN_VFMULCPH_V8HF, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fmulc_v8hf_mask, "__builtin_ia32_vfmulcph128_mask", IX86_BUILTIN_VFMULCPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_fmulc_v16hf, "__builtin_ia32_vfmulcph256", IX86_BUILTIN_VFMULCPH_V16HF, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_fmulc_v16hf_mask, "__builtin_ia32_vfmulcph256_mask", IX86_BUILTIN_VFMULCPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UQI)
/* Builtins with rounding support. */
BDESC_END (ARGS, ROUND_ARGS)
@@ -3201,6 +3221,26 @@ BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_vmfnmadd_v8hf_mask_round
BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_vmfnmadd_v8hf_mask3_round, "__builtin_ia32_vfnmaddsh3_mask3", IX86_BUILTIN_VFNMADDSH3_MASK3, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_vmfnmadd_v8hf_maskz_round, "__builtin_ia32_vfnmaddsh3_maskz", IX86_BUILTIN_VFNMADDSH3_MASKZ, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_vmfmsub_v8hf_mask3_round, "__builtin_ia32_vfmsubsh3_mask3", IX86_BUILTIN_VFMSUBSH3_MASK3, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_fma_fmaddc_v32hf_round, "__builtin_ia32_vfmaddcph512_round", IX86_BUILTIN_VFMADDCPH512_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fmaddc_v32hf_mask_round, "__builtin_ia32_vfmaddcph512_mask_round", IX86_BUILTIN_VFMADDCPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fmaddc_v32hf_maskz_round, "__builtin_ia32_vfmaddcph512_maskz_round", IX86_BUILTIN_VFMADDCPH512_MASKZ_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_fma_fcmaddc_v32hf_round, "__builtin_ia32_vfcmaddcph512_round", IX86_BUILTIN_VFCMADDCPH512_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fcmaddc_v32hf_mask_round, "__builtin_ia32_vfcmaddcph512_mask_round", IX86_BUILTIN_VFCMADDCPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fcmaddc_v32hf_maskz_round, "__builtin_ia32_vfcmaddcph512_maskz_round", IX86_BUILTIN_VFCMADDCPH512_MASKZ_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fcmulc_v32hf_round, "__builtin_ia32_vfcmulcph512_round", IX86_BUILTIN_VFCMULCPH512_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fcmulc_v32hf_mask_round, "__builtin_ia32_vfcmulcph512_mask_round", IX86_BUILTIN_VFCMULCPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fmulc_v32hf_round, "__builtin_ia32_vfmulcph512_round", IX86_BUILTIN_VFMULCPH512_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fmulc_v32hf_mask_round, "__builtin_ia32_vfmulcph512_mask_round", IX86_BUILTIN_VFMULCPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fma_fcmaddcsh_v8hf_round, "__builtin_ia32_vfcmaddcsh_round", IX86_BUILTIN_VFCMADDCSH_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask_round, "__builtin_ia32_vfcmaddcsh_mask_round", IX86_BUILTIN_VFCMADDCSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmaddcsh_v8hf_maskz_round, "__builtin_ia32_vfcmaddcsh_maskz_round", IX86_BUILTIN_VFCMADDCSH_MASKZ_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fma_fmaddcsh_v8hf_round, "__builtin_ia32_vfmaddcsh_round", IX86_BUILTIN_VFMADDCSH_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask_round, "__builtin_ia32_vfmaddcsh_mask_round", IX86_BUILTIN_VFMADDCSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fmaddcsh_v8hf_maskz_round, "__builtin_ia32_vfmaddcsh_maskz_round", IX86_BUILTIN_VFMADDCSH_MASKZ_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmulcsh_v8hf_round, "__builtin_ia32_vfcmulcsh_round", IX86_BUILTIN_VFCMULCSH_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmulcsh_v8hf_mask_round, "__builtin_ia32_vfcmulcsh_mask_round", IX86_BUILTIN_VFCMULCSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fmulcsh_v8hf_round, "__builtin_ia32_vfmulcsh_round", IX86_BUILTIN_VFMULCSH_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fmulcsh_v8hf_mask_round, "__builtin_ia32_vfmulcsh_mask_round", IX86_BUILTIN_VFMULCSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
BDESC_END (ROUND_ARGS, MULTI_ARG)
diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index c88cb14..4780b99 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -3638,6 +3638,8 @@ ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode,
return false;
else if (vector_size == 64)
return true;
+ else if (GET_MODE_INNER (cmp_mode) == HFmode)
+ return true;
/* When op_true is NULL, op_false must be NULL, or vice versa. */
gcc_assert (!op_true == !op_false);
@@ -9762,6 +9764,7 @@ ix86_expand_args_builtin (const struct builtin_description *d,
case V2DI_FTYPE_V8HF_V2DI_UQI:
case V2DI_FTYPE_V4SF_V2DI_UQI:
case V8HF_FTYPE_V8HF_V8HF_UQI:
+ case V8HF_FTYPE_V8HF_V8HF_V8HF:
case V8HF_FTYPE_V8HI_V8HF_UQI:
case V8HF_FTYPE_V8SI_V8HF_UQI:
case V8HF_FTYPE_V8SF_V8HF_UQI:
@@ -9840,6 +9843,7 @@ ix86_expand_args_builtin (const struct builtin_description *d,
case V16SF_FTYPE_V8SF_V16SF_UHI:
case V16SI_FTYPE_V8SI_V16SI_UHI:
case V16HF_FTYPE_V16HI_V16HF_UHI:
+ case V16HF_FTYPE_V16HF_V16HF_V16HF:
case V16HI_FTYPE_V16HF_V16HI_UHI:
case V16HI_FTYPE_V16HI_V16HI_UHI:
case V8HI_FTYPE_V16QI_V8HI_UQI:
@@ -9996,6 +10000,7 @@ ix86_expand_args_builtin (const struct builtin_description *d,
case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
+ case V16HF_FTYPE_V16HF_V16HF_V16HF_UQI:
case V16HF_FTYPE_V16HF_V16HF_V16HF_UHI:
case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
@@ -10725,6 +10730,7 @@ ix86_expand_round_builtin (const struct builtin_description *d,
case V16SF_FTYPE_V16HF_V16SF_UHI_INT:
case V32HF_FTYPE_V32HI_V32HF_USI_INT:
case V32HF_FTYPE_V32HF_V32HF_USI_INT:
+ case V32HF_FTYPE_V32HF_V32HF_V32HF_INT:
case V16SF_FTYPE_V16SF_V16SF_HI_INT:
case V8DI_FTYPE_V8SF_V8DI_QI_INT:
case V16SF_FTYPE_V16SI_V16SF_HI_INT:
@@ -10754,6 +10760,7 @@ ix86_expand_round_builtin (const struct builtin_description *d,
case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
case V4SF_FTYPE_V8HF_V4SF_V4SF_UQI_INT:
case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
+ case V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT:
case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT:
case V2DF_FTYPE_V8HF_V2DF_V2DF_UQI_INT:
case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
@@ -16038,6 +16045,7 @@ emit_reduc_half (rtx dest, rtx src, int i)
break;
case E_V16QImode:
case E_V8HImode:
+ case E_V8HFmode:
case E_V4SImode:
case E_V2DImode:
d = gen_reg_rtx (V1TImode);
@@ -16059,6 +16067,7 @@ emit_reduc_half (rtx dest, rtx src, int i)
break;
case E_V32QImode:
case E_V16HImode:
+ case E_V16HFmode:
case E_V8SImode:
case E_V4DImode:
if (i == 256)
@@ -16078,6 +16087,7 @@ emit_reduc_half (rtx dest, rtx src, int i)
break;
case E_V64QImode:
case E_V32HImode:
+ case E_V32HFmode:
if (i < 64)
{
d = gen_reg_rtx (V4TImode);
diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c
index 14f816f..43bb676 100644
--- a/gcc/config/i386/i386-features.c
+++ b/gcc/config/i386/i386-features.c
@@ -2258,15 +2258,22 @@ remove_partial_avx_dependency (void)
rtx zero;
machine_mode dest_vecmode;
- if (dest_mode == E_SFmode)
+ switch (dest_mode)
{
+ case E_HFmode:
+ dest_vecmode = V8HFmode;
+ zero = gen_rtx_SUBREG (V8HFmode, v4sf_const0, 0);
+ break;
+ case E_SFmode:
dest_vecmode = V4SFmode;
zero = v4sf_const0;
- }
- else
- {
+ break;
+ case E_DFmode:
dest_vecmode = V2DFmode;
zero = gen_rtx_SUBREG (V2DFmode, v4sf_const0, 0);
+ break;
+ default:
+ gcc_unreachable ();
}
/* Change source to vector mode. */
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index ba89e11..a566d84 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -2462,6 +2462,8 @@ classify_argument (machine_mode mode, const_tree type,
case E_V2SFmode:
case E_V2SImode:
case E_V4HImode:
+ case E_V4HFmode:
+ case E_V2HFmode:
case E_V8QImode:
classes[0] = X86_64_SSE_CLASS;
return 1;
@@ -2902,6 +2904,7 @@ pass_in_reg:
case E_V8QImode:
case E_V4HImode:
+ case E_V4HFmode:
case E_V2SImode:
case E_V2SFmode:
case E_V1TImode:
@@ -3149,6 +3152,7 @@ pass_in_reg:
case E_V8QImode:
case E_V4HImode:
+ case E_V4HFmode:
case E_V2SImode:
case E_V2SFmode:
case E_V1TImode:
@@ -5035,7 +5039,8 @@ standard_80387_constant_p (rtx x)
/* For XFmode constants, try to find a special 80387 instruction when
optimizing for size or on those CPUs that benefit from them. */
if (mode == XFmode
- && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
+ && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS)
+ && !flag_rounding_math)
{
int i;
@@ -10703,24 +10708,19 @@ legitimate_pic_address_disp_p (rtx disp)
if (is_imported_p (op0))
return true;
- if (SYMBOL_REF_FAR_ADDR_P (op0)
- || !SYMBOL_REF_LOCAL_P (op0))
+ if (SYMBOL_REF_FAR_ADDR_P (op0) || !SYMBOL_REF_LOCAL_P (op0))
break;
- /* Function-symbols need to be resolved only for
- large-model.
- For the small-model we don't need to resolve anything
- here. */
+ /* Non-external-weak function symbols need to be resolved only
+ for the large model. Non-external symbols don't need to be
+ resolved for large and medium models. For the small model,
+ we don't need to resolve anything here. */
if ((ix86_cmodel != CM_LARGE_PIC
- && SYMBOL_REF_FUNCTION_P (op0))
+ && SYMBOL_REF_FUNCTION_P (op0)
+ && !(SYMBOL_REF_EXTERNAL_P (op0) && SYMBOL_REF_WEAK (op0)))
+ || !SYMBOL_REF_EXTERNAL_P (op0)
|| ix86_cmodel == CM_SMALL_PIC)
return true;
- /* Non-external symbols don't need to be resolved for
- large, and medium-model. */
- if ((ix86_cmodel == CM_LARGE_PIC
- || ix86_cmodel == CM_MEDIUM_PIC)
- && !SYMBOL_REF_EXTERNAL_P (op0))
- return true;
}
else if (!SYMBOL_REF_FAR_ADDR_P (op0)
&& (SYMBOL_REF_LOCAL_P (op0)
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 8a4251b..cba6d83 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -1033,7 +1033,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
|| (MODE) == TImode)
#define VALID_AVX512FP16_REG_MODE(MODE) \
- ((MODE) == V8HFmode || (MODE) == V16HFmode || (MODE) == V32HFmode)
+ ((MODE) == V8HFmode || (MODE) == V16HFmode || (MODE) == V32HFmode \
+ || (MODE) == V2HFmode)
#define VALID_SSE2_REG_MODE(MODE) \
((MODE) == V16QImode || (MODE) == V8HImode || (MODE) == V2DFmode \
@@ -1041,7 +1042,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
|| (MODE) == V2DImode || (MODE) == DFmode || (MODE) == HFmode)
#define VALID_SSE2_REG_VHF_MODE(MODE) \
- (VALID_SSE2_REG_MODE (MODE) || (MODE) == V8HFmode)
+ (VALID_SSE2_REG_MODE (MODE) || (MODE) == V8HFmode \
+ || (MODE) == V4HFmode || (MODE) == V2HFmode)
#define VALID_SSE_REG_MODE(MODE) \
((MODE) == V1TImode || (MODE) == TImode \
@@ -1051,10 +1053,12 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
#define VALID_MMX_REG_MODE_3DNOW(MODE) \
((MODE) == V2SFmode || (MODE) == SFmode)
+/* V4HFmode is included here to match the ia32 psABI.  */
#define VALID_MMX_REG_MODE(MODE) \
((MODE) == V1DImode || (MODE) == DImode \
|| (MODE) == V2SImode || (MODE) == SImode \
- || (MODE) == V4HImode || (MODE) == V8QImode)
+ || (MODE) == V4HImode || (MODE) == V8QImode \
+ || (MODE) == V4HFmode)
#define VALID_MASK_REG_MODE(MODE) ((MODE) == HImode || (MODE) == QImode)
@@ -1087,7 +1091,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
|| (MODE) == V4DImode || (MODE) == V8SFmode || (MODE) == V4DFmode \
|| (MODE) == V2TImode || (MODE) == V8DImode || (MODE) == V64QImode \
|| (MODE) == V16SImode || (MODE) == V32HImode || (MODE) == V8DFmode \
- || (MODE) == V16SFmode || VALID_AVX512FP16_REG_MODE (MODE))
+ || (MODE) == V16SFmode || (MODE) == V32HFmode || (MODE) == V16HFmode \
+ || (MODE) == V8HFmode)
#define X87_FLOAT_MODE_P(MODE) \
(TARGET_80387 && ((MODE) == SFmode || (MODE) == DFmode || (MODE) == XFmode))
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 188f431..04cb3bf 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -498,7 +498,7 @@
;; Main data type used by the insn
(define_attr "mode"
"unknown,none,QI,HI,SI,DI,TI,OI,XI,HF,SF,DF,XF,TF,V32HF,V16HF,V8HF,
- V16SF,V8SF,V4DF,V4SF,V2DF,V2SF,V1DF,V8DF"
+ V16SF,V8SF,V4DF,V4SF,V2DF,V2SF,V1DF,V8DF,V4HF,V2HF"
(const_string "unknown"))
;; The CPU unit operations uses.
@@ -832,7 +832,7 @@
x64_avx,x64_avx512bw,x64_avx512dq,
sse_noavx,sse2,sse2_noavx,sse3,sse3_noavx,sse4,sse4_noavx,
avx,noavx,avx2,noavx2,bmi,bmi2,fma4,fma,avx512f,noavx512f,
- avx512bw,noavx512bw,avx512dq,noavx512dq,
+ avx512bw,noavx512bw,avx512dq,noavx512dq,fma_or_avx512vl,
avx512vl,noavx512vl,avxvnni,avx512vnnivl,avx512fp16"
(const_string "base"))
@@ -874,6 +874,8 @@
(eq_attr "isa" "bmi2") (symbol_ref "TARGET_BMI2")
(eq_attr "isa" "fma4") (symbol_ref "TARGET_FMA4")
(eq_attr "isa" "fma") (symbol_ref "TARGET_FMA")
+ (eq_attr "isa" "fma_or_avx512vl")
+ (symbol_ref "TARGET_FMA || TARGET_AVX512VL")
(eq_attr "isa" "avx512f") (symbol_ref "TARGET_AVX512F")
(eq_attr "isa" "noavx512f") (symbol_ref "!TARGET_AVX512F")
(eq_attr "isa" "avx512bw") (symbol_ref "TARGET_AVX512BW")
@@ -1104,7 +1106,8 @@
(V1TI "16") (V2TI "32") (V4TI "64")
(V2DF "16") (V4DF "32") (V8DF "64")
(V4SF "16") (V8SF "32") (V16SF "64")
- (V8HF "16") (V16HF "32") (V32HF "64")])
+ (V8HF "16") (V16HF "32") (V32HF "64")
+ (V4HF "8") (V2HF "4")])
;; Double word integer modes as mode attribute.
(define_mode_attr DWI [(QI "HI") (HI "SI") (SI "DI") (DI "TI") (TI "OI")])
@@ -1541,6 +1544,21 @@
DONE;
})
+(define_expand "cstorehf4"
+ [(set (reg:CC FLAGS_REG)
+ (compare:CC (match_operand:HF 2 "cmp_fp_expander_operand")
+ (match_operand:HF 3 "cmp_fp_expander_operand")))
+ (set (match_operand:QI 0 "register_operand")
+ (match_operator 1 "ix86_fp_comparison_operator"
+ [(reg:CC FLAGS_REG)
+ (const_int 0)]))]
+ "TARGET_AVX512FP16"
+{
+ ix86_expand_setcc (operands[0], GET_CODE (operands[1]),
+ operands[2], operands[3]);
+ DONE;
+})
+
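+;; Illustrative C reaching this expander (assumes -mavx512fp16):
+;;   int lt (_Float16 a, _Float16 b) { return a < b; }
+;; The comparison becomes a vcomish/vucomish flags compare followed by
+;; a setcc of the chosen condition.
+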
(define_expand "cstore<mode>4"
[(set (reg:CC FLAGS_REG)
(compare:CC (match_operand:MODEF 2 "cmp_fp_expander_operand")
@@ -4793,6 +4811,16 @@
}
})
+(define_insn "fix<fixunssuffix>_trunchf<mode>2"
+ [(set (match_operand:SWI48 0 "register_operand" "=r")
+ (any_fix:SWI48
+ (match_operand:HF 1 "nonimmediate_operand" "vm")))]
+ "TARGET_AVX512FP16"
+ "vcvttsh2<fixsuffix>si\t{%1, %0|%0, %1}"
+ [(set_attr "type" "sseicvt")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<MODE>")])
+
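+;; Illustrative C (assumes -mavx512fp16): a cast such as
+;;   long long to_i64 (_Float16 x) { return (long long) x; }
+;; selects vcvttsh2si here; any_fix also covers the unsigned variant
+;; (vcvttsh2usi).
+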
;; Signed conversion to SImode.
(define_expand "fix_truncxfsi2"
@@ -4900,6 +4928,17 @@
(set_attr "prefix" "evex")
(set_attr "mode" "SI")])
+(define_insn "*fixuns_trunchfsi2zext"
+ [(set (match_operand:DI 0 "register_operand" "=r")
+ (zero_extend:DI
+ (unsigned_fix:SI
+ (match_operand:HF 1 "nonimmediate_operand" "vm"))))]
+ "TARGET_64BIT && TARGET_AVX512FP16"
+ "vcvttsh2usi\t{%1, %k0|%k0, %1}"
+ [(set_attr "type" "sseicvt")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "SI")])
+
(define_insn "*fixuns_trunc<mode>si2_avx512f_zext"
[(set (match_operand:DI 0 "register_operand" "=r")
(zero_extend:DI
@@ -4932,6 +4971,14 @@
;; Without these patterns, we'll try the unsigned SI conversion which
;; is complex for SSE, rather than the signed SI conversion, which isn't.
+(define_expand "fixuns_trunchfhi2"
+ [(set (match_dup 2)
+ (fix:SI (match_operand:HF 1 "nonimmediate_operand")))
+ (set (match_operand:HI 0 "nonimmediate_operand")
+ (subreg:HI (match_dup 2) 0))]
+ "TARGET_AVX512FP16"
+ "operands[2] = gen_reg_rtx (SImode);")
+
(define_expand "fixuns_trunc<mode>hi2"
[(set (match_dup 2)
(fix:SI (match_operand:MODEF 1 "nonimmediate_operand")))
@@ -10163,6 +10210,40 @@
[(set_attr "type" "alu")
(set_attr "mode" "<MODE>")])
+;; convert (sign_extend:WIDE (any_logic:NARROW (memory, immediate)))
+;; to (any_logic:WIDE (sign_extend (memory)), (sign_extend (immediate))).
+;; This eliminates sign extension after logic operation.
+
+(define_split
+ [(set (match_operand:SWI248 0 "register_operand")
+ (sign_extend:SWI248
+ (any_logic:QI (match_operand:QI 1 "memory_operand")
+ (match_operand:QI 2 "const_int_operand"))))]
+ ""
+ [(set (match_dup 3) (sign_extend:SWI248 (match_dup 1)))
+ (set (match_dup 0) (any_logic:SWI248 (match_dup 3) (match_dup 2)))]
+ "operands[3] = gen_reg_rtx (<MODE>mode);")
+
+(define_split
+ [(set (match_operand:SWI48 0 "register_operand")
+ (sign_extend:SWI48
+ (any_logic:HI (match_operand:HI 1 "memory_operand")
+ (match_operand:HI 2 "const_int_operand"))))]
+ ""
+ [(set (match_dup 3) (sign_extend:SWI48 (match_dup 1)))
+ (set (match_dup 0) (any_logic:SWI48 (match_dup 3) (match_dup 2)))]
+ "operands[3] = gen_reg_rtx (<MODE>mode);")
+
+(define_split
+ [(set (match_operand:DI 0 "register_operand")
+ (sign_extend:DI
+ (any_logic:SI (match_operand:SI 1 "memory_operand")
+ (match_operand:SI 2 "const_int_operand"))))]
+ "TARGET_64BIT"
+ [(set (match_dup 3) (sign_extend:DI (match_dup 1)))
+ (set (match_dup 0) (any_logic:DI (match_dup 3) (match_dup 2)))]
+ "operands[3] = gen_reg_rtx (DImode);")
+
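+;; Illustrative C (hypothetical): for
+;;   int f (signed char *p) { return *p & ~1; }
+;; combine forms (sign_extend:SI (and:QI (mem:QI ...) (const_int -2))).
+;; These splits instead sign-extend the load (movsbl) and perform the
+;; logic operation in SImode, so no extension of the narrow result is
+;; left over.
+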
(define_insn "*<code><mode>_2"
[(set (reg FLAGS_REG)
(compare (any_or:SWI
@@ -17041,6 +17122,19 @@
DONE;
})
+(define_insn "sqrthf2"
+ [(set (match_operand:HF 0 "register_operand" "=v,v")
+ (sqrt:HF
+ (match_operand:HF 1 "nonimmediate_operand" "v,m")))]
+ "TARGET_AVX512FP16"
+ "@
+ vsqrtsh\t{%d1, %0|%0, %d1}
+ vsqrtsh\t{%1, %d0|%d0, %1}"
+ [(set_attr "type" "sse")
+ (set_attr "prefix" "evex")
+ (set_attr "avx_partial_xmm_update" "false,true")
+ (set_attr "mode" "HF")])
+
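+;; Illustrative C (assumes GCC's _Float16 math builtins):
+;;   _Float16 f (_Float16 x) { return __builtin_sqrtf16 (x); }
+;; expands through the sqrt optab to a single vsqrtsh.
+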
(define_insn "*sqrt<mode>2_sse"
[(set (match_operand:MODEF 0 "register_operand" "=v,v,v")
(sqrt:MODEF
@@ -18220,9 +18314,9 @@
(define_insn "sse4_1_round<mode>2"
- [(set (match_operand:MODEF 0 "register_operand" "=x,x,x,v,v")
- (unspec:MODEF
- [(match_operand:MODEF 1 "nonimmediate_operand" "0,x,m,v,m")
+ [(set (match_operand:MODEFH 0 "register_operand" "=x,x,x,v,v")
+ (unspec:MODEFH
+ [(match_operand:MODEFH 1 "nonimmediate_operand" "0,x,m,v,m")
(match_operand:SI 2 "const_0_to_15_operand" "n,n,n,n,n")]
UNSPEC_ROUND))]
"TARGET_SSE4_1"
@@ -18257,6 +18351,17 @@
(set_attr "znver1_decode" "vector")
(set_attr "mode" "XF")])
+(define_expand "rinthf2"
+ [(match_operand:HF 0 "register_operand")
+ (match_operand:HF 1 "nonimmediate_operand")]
+ "TARGET_AVX512FP16"
+{
+ emit_insn (gen_sse4_1_roundhf2 (operands[0],
+ operands[1],
+ GEN_INT (ROUND_MXCSR)));
+ DONE;
+})
+
(define_expand "rint<mode>2"
[(use (match_operand:MODEF 0 "register_operand"))
(use (match_operand:MODEF 1 "nonimmediate_operand"))]
@@ -18290,6 +18395,17 @@
"TARGET_USE_FANCY_MATH_387
&& !flag_trapping_math")
+(define_expand "nearbyinthf2"
+ [(match_operand:HF 0 "register_operand")
+ (match_operand:HF 1 "nonimmediate_operand")]
+ "TARGET_AVX512FP16"
+{
+ emit_insn (gen_sse4_1_roundhf2 (operands[0],
+ operands[1],
+ GEN_INT (ROUND_MXCSR | ROUND_NO_EXC)));
+ DONE;
+})
+
(define_expand "nearbyint<mode>2"
[(use (match_operand:MODEF 0 "register_operand"))
(use (match_operand:MODEF 1 "nonimmediate_operand"))]
@@ -18479,6 +18595,18 @@
"TARGET_USE_FANCY_MATH_387
&& (flag_fp_int_builtin_inexact || !flag_trapping_math)")
+(define_expand "<rounding_insn>hf2"
+ [(parallel [(set (match_operand:HF 0 "register_operand")
+ (unspec:HF [(match_operand:HF 1 "register_operand")]
+ FRNDINT_ROUNDING))
+ (clobber (reg:CC FLAGS_REG))])]
+ "TARGET_AVX512FP16"
+{
+ emit_insn (gen_sse4_1_roundhf2 (operands[0], operands[1],
+ GEN_INT (ROUND_<ROUNDING> | ROUND_NO_EXC)));
+ DONE;
+})
+
(define_expand "<rounding_insn><mode>2"
[(parallel [(set (match_operand:MODEF 0 "register_operand")
(unspec:MODEF [(match_operand:MODEF 1 "register_operand")]
@@ -19882,6 +20010,17 @@
(set_attr "type" "sseadd")
(set_attr "mode" "<MODE>")])
+(define_insn "<code>hf3"
+ [(set (match_operand:HF 0 "register_operand" "=v")
+ (smaxmin:HF
+ (match_operand:HF 1 "nonimmediate_operand" "%v")
+ (match_operand:HF 2 "nonimmediate_operand" "vm")))]
+ "TARGET_AVX512FP16"
+ "v<maxmin_float>sh\t{%2, %1, %0|%0, %1, %2}"
+ [(set_attr "prefix" "evex")
+ (set_attr "type" "sseadd")
+ (set_attr "mode" "HF")])
+
;; These versions of the min/max patterns implement exactly the operations
;; min = (op1 < op2 ? op1 : op2)
;; max = (!(op1 < op2) ? op1 : op2)
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 2d3b63f..c9467bc 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -48,7 +48,7 @@
(define_mode_iterator MMXMODEI8 [V8QI V4HI V2SI (V1DI "TARGET_SSE2")])
;; All 8-byte vector modes handled by MMX
-(define_mode_iterator MMXMODE [V8QI V4HI V2SI V1DI V2SF])
+(define_mode_iterator MMXMODE [V8QI V4HI V2SI V1DI V2SF V4HF])
(define_mode_iterator MMXMODE124 [V8QI V4HI V2SI V2SF])
;; Mix-n-match
@@ -57,8 +57,8 @@
(define_mode_iterator MMXMODE24 [V4HI V2SI])
(define_mode_iterator MMXMODE248 [V4HI V2SI V1DI])
-;; All 4-byte integer vector modes
-(define_mode_iterator V_32 [V4QI V2HI V1SI])
+;; All 4-byte integer/float16 vector modes
+(define_mode_iterator V_32 [V4QI V2HI V1SI V2HF])
;; 4-byte integer vector modes
(define_mode_iterator VI_32 [V4QI V2HI])
@@ -66,6 +66,9 @@
;; V2S* modes
(define_mode_iterator V2FI [V2SF V2SI])
+;; 4-byte and 8-byte float16 vector modes
+(define_mode_iterator VHF_32_64 [V4HF V2HF])
+
;; Mapping from integer vector mode to mnemonic suffix
(define_mode_attr mmxvecsize
[(V8QI "b") (V4QI "b") (V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")])
@@ -191,6 +194,8 @@
(eq_attr "alternative" "11,12")
(cond [(match_test "<MODE>mode == V2SFmode")
(const_string "V4SF")
+ (match_test "<MODE>mode == V4HFmode")
+ (const_string "V4SF")
(ior (not (match_test "TARGET_SSE2"))
(match_test "optimize_function_for_size_p (cfun)"))
(const_string "V4SF")
@@ -198,14 +203,16 @@
(const_string "TI"))
(and (eq_attr "alternative" "13")
- (ior (and (match_test "<MODE>mode == V2SFmode")
- (not (match_test "TARGET_MMX_WITH_SSE")))
- (not (match_test "TARGET_SSE2"))))
+ (ior (ior (and (match_test "<MODE>mode == V2SFmode")
+ (not (match_test "TARGET_MMX_WITH_SSE")))
+ (not (match_test "TARGET_SSE2")))
+ (match_test "<MODE>mode == V4HFmode")))
(const_string "V2SF")
(and (eq_attr "alternative" "14")
- (ior (match_test "<MODE>mode == V2SFmode")
- (not (match_test "TARGET_SSE2"))))
+ (ior (ior (match_test "<MODE>mode == V2SFmode")
+ (not (match_test "TARGET_SSE2")))
+ (match_test "<MODE>mode == V4HFmode")))
(const_string "V2SF")
]
(const_string "DI")))
@@ -289,12 +296,17 @@
(const_string "*")))
(set (attr "mode")
(cond [(eq_attr "alternative" "2,3")
- (cond [(match_test "TARGET_AVX")
+ (cond [(match_test "<MODE>mode == V2HFmode")
+ (const_string "V4SF")
+ (match_test "TARGET_AVX")
(const_string "TI")
(match_test "optimize_function_for_size_p (cfun)")
(const_string "V4SF")
]
(const_string "TI"))
+ (and (eq_attr "alternative" "4,5")
+ (match_test "<MODE>mode == V2HFmode"))
+ (const_string "SF")
]
(const_string "SI")))
(set (attr "preferred_for_speed")
@@ -1019,12 +1031,13 @@
(match_operand:V2SF 1 "register_operand" "%0,v,x")
(match_operand:V2SF 2 "register_operand" "v,v,x")
(match_operand:V2SF 3 "register_operand" "v,0,x")))]
- "(TARGET_FMA || TARGET_FMA4) && TARGET_MMX_WITH_SSE"
+ "(TARGET_FMA || TARGET_FMA4 || TARGET_AVX512VL)
+ && TARGET_MMX_WITH_SSE"
"@
vfmadd132ps\t{%2, %3, %0|%0, %3, %2}
vfmadd231ps\t{%2, %1, %0|%0, %1, %2}
vfmaddps\t{%3, %2, %1, %0|%0, %1, %2, %3}"
- [(set_attr "isa" "fma,fma,fma4")
+ [(set_attr "isa" "fma_or_avx512vl,fma_or_avx512vl,fma4")
(set_attr "type" "ssemuladd")
(set_attr "mode" "V4SF")])
@@ -1035,12 +1048,13 @@
(match_operand:V2SF 2 "register_operand" "v,v,x")
(neg:V2SF
(match_operand:V2SF 3 "register_operand" "v,0,x"))))]
- "(TARGET_FMA || TARGET_FMA4) && TARGET_MMX_WITH_SSE"
+ "(TARGET_FMA || TARGET_FMA4 || TARGET_AVX512VL)
+ && TARGET_MMX_WITH_SSE"
"@
vfmsub132ps\t{%2, %3, %0|%0, %3, %2}
vfmsub231ps\t{%2, %1, %0|%0, %1, %2}
vfmsubps\t{%3, %2, %1, %0|%0, %1, %2, %3}"
- [(set_attr "isa" "fma,fma,fma4")
+ [(set_attr "isa" "fma_or_avx512vl,fma_or_avx512vl,fma4")
(set_attr "type" "ssemuladd")
(set_attr "mode" "V4SF")])
@@ -1051,12 +1065,13 @@
(match_operand:V2SF 1 "register_operand" "%0,v,x"))
(match_operand:V2SF 2 "register_operand" "v,v,x")
(match_operand:V2SF 3 "register_operand" "v,0,x")))]
- "(TARGET_FMA || TARGET_FMA4) && TARGET_MMX_WITH_SSE"
+ "(TARGET_FMA || TARGET_FMA4 || TARGET_AVX512VL)
+ && TARGET_MMX_WITH_SSE"
"@
vfnmadd132ps\t{%2, %3, %0|%0, %3, %2}
vfnmadd231ps\t{%2, %1, %0|%0, %1, %2}
vfnmaddps\t{%3, %2, %1, %0|%0, %1, %2, %3}"
- [(set_attr "isa" "fma,fma,fma4")
+ [(set_attr "isa" "fma_or_avx512vl,fma_or_avx512vl,fma4")
(set_attr "type" "ssemuladd")
(set_attr "mode" "V4SF")])
@@ -1068,12 +1083,13 @@
(match_operand:V2SF 2 "register_operand" "v,v,x")
(neg:V2SF
(match_operand:V2SF 3 "register_operand" "v,0,x"))))]
- "(TARGET_FMA || TARGET_FMA4) && TARGET_MMX_WITH_SSE"
+ "(TARGET_FMA || TARGET_FMA4 || TARGET_AVX512VL)
+ && TARGET_MMX_WITH_SSE"
"@
vfnmsub132ps\t{%2, %3, %0|%0, %3, %2}
vfnmsub231ps\t{%2, %1, %0|%0, %1, %2}
vfnmsubps\t{%3, %2, %1, %0|%0, %1, %2, %3}"
- [(set_attr "isa" "fma,fma,fma4")
+ [(set_attr "isa" "fma_or_avx512vl,fma_or_avx512vl,fma4")
(set_attr "type" "ssemuladd")
(set_attr "mode" "V4SF")])
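
The relaxed conditions above mean AVX512VL alone can now supply the V2SF fused multiply-add forms. A hedged sketch of code that should contract into them (assumed flags -O2 -mavx512vl -ffp-contract=fast; not part of the patch):

typedef float v2sf __attribute__ ((vector_size (8)));

v2sf
muladd2 (v2sf a, v2sf b, v2sf c)
{
  /* May contract to vfmadd132ps/vfmadd231ps on the 64-bit vectors.  */
  return a * b + c;
}
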
@@ -1389,6 +1405,28 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
+;; Parallel half-precision floating point arithmetic
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn "<insn><mode>3"
+ [(set (match_operand:VHF_32_64 0 "register_operand" "=v")
+ (plusminusmultdiv:VHF_32_64
+ (match_operand:VHF_32_64 1 "register_operand" "<comm>v")
+ (match_operand:VHF_32_64 2 "register_operand" "v")))]
+ "TARGET_AVX512FP16 && TARGET_AVX512VL"
+ "v<insn>ph\t{%2, %1, %0|%0, %1, %2}"
+ [(set (attr "type")
+ (cond [(match_test "<CODE> == MULT")
+ (const_string "ssemul")
+ (match_test "<CODE> == DIV")
+ (const_string "ssediv")]
+ (const_string "sseadd")))
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "V8HF")])
+
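
A minimal sketch of source that should reach the new VHF_32_64 patterns (assuming GNU vector extensions and -mavx512fp16 -mavx512vl; not from the patch):

typedef _Float16 v4hf __attribute__ ((vector_size (8)));

v4hf
v4hf_add (v4hf a, v4hf b)
{
  /* plusminusmultdiv at V4HF; expected to emit vaddph.  */
  return a + b;
}
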
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
;; Parallel integral arithmetic
;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 0016c02..4559b0c 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -191,6 +191,14 @@
UNSPEC_VCVTNE2PS2BF16
UNSPEC_VCVTNEPS2BF16
UNSPEC_VDPBF16PS
+
+  ;; For AVX512FP16 support
+ UNSPEC_COMPLEX_FMA
+ UNSPEC_COMPLEX_FCMA
+ UNSPEC_COMPLEX_FMUL
+ UNSPEC_COMPLEX_FCMUL
+ UNSPEC_COMPLEX_MASK
+
])
(define_c_enum "unspecv" [
@@ -939,6 +947,10 @@
(V16SF "HI") (V8SF "QI") (V4SF "QI")
(V8DF "QI") (V4DF "QI") (V2DF "QI")])
+;; Mapping of vector modes to corresponding complex mask size
+(define_mode_attr avx512fmaskcmode
+ [(V32HF "HI") (V16HF "QI") (V8HF "QI")])
+
;; Mapping of vector modes to corresponding mask size
(define_mode_attr avx512fmaskmodelower
[(V64QI "di") (V32QI "si") (V16QI "hi")
@@ -977,9 +989,9 @@
(V16HF "OI") (V8HF "TI")])
(define_mode_attr sseintvecmodelower
- [(V16SF "v16si") (V8DF "v8di")
- (V8SF "v8si") (V4DF "v4di")
- (V4SF "v4si") (V2DF "v2di")
+ [(V32HF "v32hi") (V16SF "v16si") (V8DF "v8di")
+ (V16HF "v16hi") (V8SF "v8si") (V4DF "v4di")
+ (V8HF "v8hi") (V4SF "v4si") (V2DF "v2di")
(V8SI "v8si") (V4DI "v4di")
(V4SI "v4si") (V2DI "v2di")
(V16HI "v16hi") (V8HI "v8hi")
@@ -1022,6 +1034,13 @@
(V8DI "V8HF") (V4DI "V8HF") (V2DI "V8HF")
(V8DF "V8HF") (V16SF "V16HF") (V8SF "V8HF")])
+;; Mapping of vector modes to vector hf modes of same element.
+(define_mode_attr ssePHmodelower
+ [(V32HI "v32hf") (V16HI "v16hf") (V8HI "v8hf")
+ (V16SI "v16hf") (V8SI "v8hf") (V4SI "v4hf")
+ (V8DI "v8hf") (V4DI "v4hf") (V2DI "v2hf")
+ (V8DF "v8hf") (V16SF "v16hf") (V8SF "v8hf")])
+
;; Mapping of vector modes to packed single mode of the same size
(define_mode_attr ssePSmode
[(V16SI "V16SF") (V8DF "V16SF")
@@ -1549,9 +1568,9 @@
(set_attr "mode" "<sseinsnmode>")])
(define_insn "<avx512>_store<mode>_mask"
- [(set (match_operand:VI12_AVX512VL 0 "memory_operand" "=m")
- (vec_merge:VI12_AVX512VL
- (match_operand:VI12_AVX512VL 1 "register_operand" "v")
+ [(set (match_operand:VI12HF_AVX512VL 0 "memory_operand" "=m")
+ (vec_merge:VI12HF_AVX512VL
+ (match_operand:VI12HF_AVX512VL 1 "register_operand" "v")
(match_dup 0)
(match_operand:<avx512fmaskmode> 2 "register_operand" "Yk")))]
"TARGET_AVX512BW"
@@ -2106,12 +2125,12 @@
[(set_attr "isa" "noavx,noavx,avx,avx")])
(define_expand "cond_<insn><mode>"
- [(set (match_operand:VF 0 "register_operand")
- (vec_merge:VF
- (plusminus:VF
- (match_operand:VF 2 "vector_operand")
- (match_operand:VF 3 "vector_operand"))
- (match_operand:VF 4 "nonimm_or_0_operand")
+ [(set (match_operand:VFH 0 "register_operand")
+ (vec_merge:VFH
+ (plusminus:VFH
+ (match_operand:VFH 2 "vector_operand")
+ (match_operand:VFH 3 "vector_operand"))
+ (match_operand:VFH 4 "nonimm_or_0_operand")
(match_operand:<avx512fmaskmode> 1 "register_operand")))]
"<MODE_SIZE> == 64 || TARGET_AVX512VL"
{
@@ -2195,12 +2214,12 @@
(set_attr "mode" "<ssescalarmode>")])
(define_expand "cond_mul<mode>"
- [(set (match_operand:VF 0 "register_operand")
- (vec_merge:VF
- (mult:VF
- (match_operand:VF 2 "vector_operand")
- (match_operand:VF 3 "vector_operand"))
- (match_operand:VF 4 "nonimm_or_0_operand")
+ [(set (match_operand:VFH 0 "register_operand")
+ (vec_merge:VFH
+ (mult:VFH
+ (match_operand:VFH 2 "vector_operand")
+ (match_operand:VFH 3 "vector_operand"))
+ (match_operand:VFH 4 "nonimm_or_0_operand")
(match_operand:<avx512fmaskmode> 1 "register_operand")))]
"<MODE_SIZE> == 64 || TARGET_AVX512VL"
{
@@ -2310,12 +2329,12 @@
})
(define_expand "cond_div<mode>"
- [(set (match_operand:VF 0 "register_operand")
- (vec_merge:VF
- (div:VF
- (match_operand:VF 2 "register_operand")
- (match_operand:VF 3 "vector_operand"))
- (match_operand:VF 4 "nonimm_or_0_operand")
+ [(set (match_operand:VFH 0 "register_operand")
+ (vec_merge:VFH
+ (div:VFH
+ (match_operand:VFH 2 "register_operand")
+ (match_operand:VFH 3 "vector_operand"))
+ (match_operand:VFH 4 "nonimm_or_0_operand")
(match_operand:<avx512fmaskmode> 1 "register_operand")))]
"<MODE_SIZE> == 64 || TARGET_AVX512VL"
{
@@ -2510,12 +2529,12 @@
(set_attr "mode" "<ssescalarmode>")])
(define_insn "*<sse>_vmsqrt<mode>2<mask_scalar_name><round_scalar_name>"
- [(set (match_operand:VF_128 0 "register_operand" "=x,v")
- (vec_merge:VF_128
- (vec_duplicate:VF_128
+ [(set (match_operand:VFH_128 0 "register_operand" "=x,v")
+ (vec_merge:VFH_128
+ (vec_duplicate:VFH_128
(sqrt:<ssescalarmode>
(match_operand:<ssescalarmode> 1 "nonimmediate_operand" "xm,<round_scalar_constraint>")))
- (match_operand:VF_128 2 "register_operand" "0,v")
+ (match_operand:VFH_128 2 "register_operand" "0,v")
(const_int 1)))]
"TARGET_SSE"
"@
@@ -2648,12 +2667,12 @@
(set_attr "mode" "HF")])
(define_expand "cond_<code><mode>"
- [(set (match_operand:VF 0 "register_operand")
- (vec_merge:VF
- (smaxmin:VF
- (match_operand:VF 2 "vector_operand")
- (match_operand:VF 3 "vector_operand"))
- (match_operand:VF 4 "nonimm_or_0_operand")
+ [(set (match_operand:VFH 0 "register_operand")
+ (vec_merge:VFH
+ (smaxmin:VFH
+ (match_operand:VFH 2 "vector_operand")
+ (match_operand:VFH 3 "vector_operand"))
+ (match_operand:VFH 4 "nonimm_or_0_operand")
(match_operand:<avx512fmaskmode> 1 "register_operand")))]
"<MODE_SIZE> == 64 || TARGET_AVX512VL"
{
@@ -3137,36 +3156,20 @@
(set_attr "prefix_rep" "1,*")
(set_attr "mode" "V4SF")])
-(define_expand "reduc_plus_scal_v4sf"
- [(plus:V4SF
- (match_operand:SF 0 "register_operand")
- (match_operand:V4SF 1 "register_operand"))]
- "TARGET_SSE"
-{
- rtx vtmp = gen_reg_rtx (V4SFmode);
- rtx stmp = gen_reg_rtx (SFmode);
- if (TARGET_SSE3)
- emit_insn (gen_sse3_movshdup (vtmp, operands[1]));
- else
- emit_insn (gen_sse_shufps (vtmp, operands[1], operands[1], GEN_INT(177)));
-
- emit_insn (gen_addv4sf3 (operands[1], operands[1], vtmp));
- emit_insn (gen_sse_movhlps (vtmp, vtmp, operands[1]));
- emit_insn (gen_vec_extractv4sfsf (stmp, vtmp, const0_rtx));
- emit_insn (gen_vec_extractv4sfsf (operands[0], operands[1], const0_rtx));
- emit_insn (gen_addsf3 (operands[0], operands[0], stmp));
- DONE;
-})
+(define_mode_iterator REDUC_SSE_PLUS_MODE
+ [(V2DF "TARGET_SSE") (V4SF "TARGET_SSE")
+ (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL")])
-(define_expand "reduc_plus_scal_v2df"
- [(plus:V2DF
- (match_operand:DF 0 "register_operand")
- (match_operand:V2DF 1 "register_operand"))]
- "TARGET_SSE"
+(define_expand "reduc_plus_scal_<mode>"
+ [(plus:REDUC_SSE_PLUS_MODE
+ (match_operand:<ssescalarmode> 0 "register_operand")
+ (match_operand:REDUC_SSE_PLUS_MODE 1 "register_operand"))]
+ ""
{
- rtx tmp = gen_reg_rtx (V2DFmode);
- ix86_expand_reduc (gen_addv2df3, tmp, operands[1]);
- emit_insn (gen_vec_extractv2dfdf (operands[0], tmp, const0_rtx));
+ rtx tmp = gen_reg_rtx (<MODE>mode);
+ ix86_expand_reduc (gen_add<mode>3, tmp, operands[1]);
+ emit_insn (gen_vec_extract<mode><ssescalarmodelower> (operands[0], tmp,
+ const0_rtx));
DONE;
})
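
For illustration (assumed usage, not from the patch): with the unified reduc_plus_scal_<mode> expander, a half-float sum like the one below should vectorize into a V8HF reduction under -O2 -ffast-math -mavx512fp16 -mavx512vl.

_Float16
hsum8 (const _Float16 *p)
{
  _Float16 s = 0;
  for (int i = 0; i < 8; i++)
    s += p[i];   /* reassociable with -ffast-math, enabling the reduction */
  return s;
}
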
@@ -3192,7 +3195,9 @@
(define_mode_iterator REDUC_PLUS_MODE
[(V4DF "TARGET_AVX") (V8SF "TARGET_AVX")
+ (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
(V8DF "TARGET_AVX512F") (V16SF "TARGET_AVX512F")
+ (V32HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
(V32QI "TARGET_AVX") (V64QI "TARGET_AVX512F")])
(define_expand "reduc_plus_scal_<mode>"
@@ -3212,7 +3217,8 @@
;; Modes handled by reduc_sm{in,ax}* patterns.
(define_mode_iterator REDUC_SSE_SMINMAX_MODE
- [(V4SF "TARGET_SSE") (V2DF "TARGET_SSE")
+ [(V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
+ (V4SF "TARGET_SSE") (V2DF "TARGET_SSE")
(V4SI "TARGET_SSE2") (V8HI "TARGET_SSE2") (V16QI "TARGET_SSE2")
(V2DI "TARGET_SSE4_2")])
@@ -3231,9 +3237,11 @@
(define_mode_iterator REDUC_SMINMAX_MODE
[(V32QI "TARGET_AVX2") (V16HI "TARGET_AVX2")
+ (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
(V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2")
(V8SF "TARGET_AVX") (V4DF "TARGET_AVX")
(V64QI "TARGET_AVX512BW")
+ (V32HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
(V32HI "TARGET_AVX512BW") (V16SI "TARGET_AVX512F")
(V8DI "TARGET_AVX512F") (V16SF "TARGET_AVX512F")
(V8DF "TARGET_AVX512F")])
@@ -3791,8 +3799,8 @@
(define_expand "vec_cmp<mode><avx512fmaskmodelower>"
[(set (match_operand:<avx512fmaskmode> 0 "register_operand")
(match_operator:<avx512fmaskmode> 1 ""
- [(match_operand:V48_AVX512VL 2 "register_operand")
- (match_operand:V48_AVX512VL 3 "nonimmediate_operand")]))]
+ [(match_operand:V48H_AVX512VL 2 "register_operand")
+ (match_operand:V48H_AVX512VL 3 "nonimmediate_operand")]))]
"TARGET_AVX512F"
{
bool ok = ix86_expand_mask_vec_cmp (operands[0], GET_CODE (operands[1]),
@@ -3999,6 +4007,51 @@
DONE;
})
+(define_expand "vcond<mode><mode>"
+ [(set (match_operand:VF_AVX512FP16VL 0 "register_operand")
+ (if_then_else:VF_AVX512FP16VL
+ (match_operator 3 ""
+ [(match_operand:VF_AVX512FP16VL 4 "vector_operand")
+ (match_operand:VF_AVX512FP16VL 5 "vector_operand")])
+ (match_operand:VF_AVX512FP16VL 1 "general_operand")
+ (match_operand:VF_AVX512FP16VL 2 "general_operand")))]
+ "TARGET_AVX512FP16"
+{
+ bool ok = ix86_expand_fp_vcond (operands);
+ gcc_assert (ok);
+ DONE;
+})
+
+(define_expand "vcond<mode><sseintvecmodelower>"
+ [(set (match_operand:VF_AVX512FP16VL 0 "register_operand")
+ (if_then_else:VF_AVX512FP16VL
+ (match_operator 3 ""
+ [(match_operand:<sseintvecmode> 4 "vector_operand")
+ (match_operand:<sseintvecmode> 5 "vector_operand")])
+ (match_operand:VF_AVX512FP16VL 1 "general_operand")
+ (match_operand:VF_AVX512FP16VL 2 "general_operand")))]
+ "TARGET_AVX512FP16"
+{
+ bool ok = ix86_expand_int_vcond (operands);
+ gcc_assert (ok);
+ DONE;
+})
+
+(define_expand "vcond<sseintvecmodelower><mode>"
+ [(set (match_operand:<sseintvecmode> 0 "register_operand")
+ (if_then_else:<sseintvecmode>
+ (match_operator 3 ""
+ [(match_operand:VF_AVX512FP16VL 4 "vector_operand")
+ (match_operand:VF_AVX512FP16VL 5 "vector_operand")])
+ (match_operand:<sseintvecmode> 1 "general_operand")
+ (match_operand:<sseintvecmode> 2 "general_operand")))]
+ "TARGET_AVX512FP16"
+{
+ bool ok = ix86_expand_fp_vcond (operands);
+ gcc_assert (ok);
+ DONE;
+})
+
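
A hedged sketch of code these vcond expanders should serve (assuming GNU vector extensions and -mavx512fp16 -mavx512vl; not taken from the patch):

typedef _Float16 v8hf __attribute__ ((vector_size (16)));

v8hf
v8hf_sel (v8hf a, v8hf b)
{
  /* Vector compare plus masked blend via the new vcond expanders.  */
  return a > b ? a : b;
}
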
(define_expand "vcond_mask_<mode><avx512fmaskmodelower>"
[(set (match_operand:V48_AVX512VL 0 "register_operand")
(vec_merge:V48_AVX512VL
@@ -4008,10 +4061,10 @@
"TARGET_AVX512F")
(define_expand "vcond_mask_<mode><avx512fmaskmodelower>"
- [(set (match_operand:VI12_AVX512VL 0 "register_operand")
- (vec_merge:VI12_AVX512VL
- (match_operand:VI12_AVX512VL 1 "nonimmediate_operand")
- (match_operand:VI12_AVX512VL 2 "nonimm_or_0_operand")
+ [(set (match_operand:VI12HF_AVX512VL 0 "register_operand")
+ (vec_merge:VI12HF_AVX512VL
+ (match_operand:VI12HF_AVX512VL 1 "nonimmediate_operand")
+ (match_operand:VI12HF_AVX512VL 2 "nonimm_or_0_operand")
(match_operand:<avx512fmaskmode> 3 "register_operand")))]
"TARGET_AVX512BW")
@@ -4638,7 +4691,11 @@
(V8SF "TARGET_FMA || TARGET_FMA4 || TARGET_AVX512VL")
(V4DF "TARGET_FMA || TARGET_FMA4 || TARGET_AVX512VL")
(V16SF "TARGET_AVX512F")
- (V8DF "TARGET_AVX512F")])
+ (V8DF "TARGET_AVX512F")
+ (HF "TARGET_AVX512FP16")
+ (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
+ (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
+ (V32HF "TARGET_AVX512FP16")])
(define_expand "fma<mode>4"
[(set (match_operand:FMAMODEM 0 "register_operand")
@@ -4746,14 +4803,11 @@
(set_attr "mode" "<MODE>")])
;; Suppose AVX-512F as baseline
-(define_mode_iterator VF_SF_AVX512VL
- [SF V16SF (V8SF "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL")
- DF V8DF (V4DF "TARGET_AVX512VL") (V2DF "TARGET_AVX512VL")])
-
(define_mode_iterator VFH_SF_AVX512VL
[(V32HF "TARGET_AVX512FP16")
(V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
(V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
+ (HF "TARGET_AVX512FP16")
SF V16SF (V8SF "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL")
DF V8DF (V4DF "TARGET_AVX512VL") (V2DF "TARGET_AVX512VL")])
@@ -4772,13 +4826,13 @@
(set_attr "mode" "<MODE>")])
(define_expand "cond_fma<mode>"
- [(set (match_operand:VF_AVX512VL 0 "register_operand")
- (vec_merge:VF_AVX512VL
- (fma:VF_AVX512VL
- (match_operand:VF_AVX512VL 2 "vector_operand")
- (match_operand:VF_AVX512VL 3 "vector_operand")
- (match_operand:VF_AVX512VL 4 "vector_operand"))
- (match_operand:VF_AVX512VL 5 "nonimm_or_0_operand")
+ [(set (match_operand:VFH_AVX512VL 0 "register_operand")
+ (vec_merge:VFH_AVX512VL
+ (fma:VFH_AVX512VL
+ (match_operand:VFH_AVX512VL 2 "vector_operand")
+ (match_operand:VFH_AVX512VL 3 "vector_operand")
+ (match_operand:VFH_AVX512VL 4 "vector_operand"))
+ (match_operand:VFH_AVX512VL 5 "nonimm_or_0_operand")
(match_operand:<avx512fmaskmode> 1 "register_operand")))]
"TARGET_AVX512F"
{
@@ -4872,14 +4926,14 @@
(set_attr "mode" "<MODE>")])
(define_expand "cond_fms<mode>"
- [(set (match_operand:VF_AVX512VL 0 "register_operand")
- (vec_merge:VF_AVX512VL
- (fma:VF_AVX512VL
- (match_operand:VF_AVX512VL 2 "vector_operand")
- (match_operand:VF_AVX512VL 3 "vector_operand")
- (neg:VF_AVX512VL
- (match_operand:VF_AVX512VL 4 "vector_operand")))
- (match_operand:VF_AVX512VL 5 "nonimm_or_0_operand")
+ [(set (match_operand:VFH_AVX512VL 0 "register_operand")
+ (vec_merge:VFH_AVX512VL
+ (fma:VFH_AVX512VL
+ (match_operand:VFH_AVX512VL 2 "vector_operand")
+ (match_operand:VFH_AVX512VL 3 "vector_operand")
+ (neg:VFH_AVX512VL
+ (match_operand:VFH_AVX512VL 4 "vector_operand")))
+ (match_operand:VFH_AVX512VL 5 "nonimm_or_0_operand")
(match_operand:<avx512fmaskmode> 1 "register_operand")))]
"TARGET_AVX512F"
{
@@ -4975,14 +5029,14 @@
(set_attr "mode" "<MODE>")])
(define_expand "cond_fnma<mode>"
- [(set (match_operand:VF_AVX512VL 0 "register_operand")
- (vec_merge:VF_AVX512VL
- (fma:VF_AVX512VL
- (neg:VF_AVX512VL
- (match_operand:VF_AVX512VL 2 "vector_operand"))
- (match_operand:VF_AVX512VL 3 "vector_operand")
- (match_operand:VF_AVX512VL 4 "vector_operand"))
- (match_operand:VF_AVX512VL 5 "nonimm_or_0_operand")
+ [(set (match_operand:VFH_AVX512VL 0 "register_operand")
+ (vec_merge:VFH_AVX512VL
+ (fma:VFH_AVX512VL
+ (neg:VFH_AVX512VL
+ (match_operand:VFH_AVX512VL 2 "vector_operand"))
+ (match_operand:VFH_AVX512VL 3 "vector_operand")
+ (match_operand:VFH_AVX512VL 4 "vector_operand"))
+ (match_operand:VFH_AVX512VL 5 "nonimm_or_0_operand")
(match_operand:<avx512fmaskmode> 1 "register_operand")))]
"TARGET_AVX512F"
{
@@ -5080,15 +5134,15 @@
(set_attr "mode" "<MODE>")])
(define_expand "cond_fnms<mode>"
- [(set (match_operand:VF_AVX512VL 0 "register_operand")
- (vec_merge:VF_AVX512VL
- (fma:VF_AVX512VL
- (neg:VF_AVX512VL
- (match_operand:VF_AVX512VL 2 "vector_operand"))
- (match_operand:VF_AVX512VL 3 "vector_operand")
- (neg:VF_AVX512VL
- (match_operand:VF_AVX512VL 4 "vector_operand")))
- (match_operand:VF_AVX512VL 5 "nonimm_or_0_operand")
+ [(set (match_operand:VFH_AVX512VL 0 "register_operand")
+ (vec_merge:VFH_AVX512VL
+ (fma:VFH_AVX512VL
+ (neg:VFH_AVX512VL
+ (match_operand:VFH_AVX512VL 2 "vector_operand"))
+ (match_operand:VFH_AVX512VL 3 "vector_operand")
+ (neg:VFH_AVX512VL
+ (match_operand:VFH_AVX512VL 4 "vector_operand")))
+ (match_operand:VFH_AVX512VL 5 "nonimm_or_0_operand")
(match_operand:<avx512fmaskmode> 1 "register_operand")))]
"TARGET_AVX512F"
{
@@ -5793,6 +5847,168 @@
[(set_attr "type" "ssemuladd")
(set_attr "mode" "<MODE>")])
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Complex type operations
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_int_iterator UNSPEC_COMPLEX_F_C_MA
+ [UNSPEC_COMPLEX_FMA UNSPEC_COMPLEX_FCMA])
+
+(define_int_iterator UNSPEC_COMPLEX_F_C_MUL
+ [UNSPEC_COMPLEX_FMUL UNSPEC_COMPLEX_FCMUL])
+
+(define_int_attr complexopname
+ [(UNSPEC_COMPLEX_FMA "fmaddc")
+ (UNSPEC_COMPLEX_FCMA "fcmaddc")
+ (UNSPEC_COMPLEX_FMUL "fmulc")
+ (UNSPEC_COMPLEX_FCMUL "fcmulc")])
+
+(define_expand "<avx512>_fmaddc_<mode>_maskz<round_expand_name>"
+ [(match_operand:VF_AVX512FP16VL 0 "register_operand")
+ (match_operand:VF_AVX512FP16VL 1 "<round_expand_nimm_predicate>")
+ (match_operand:VF_AVX512FP16VL 2 "<round_expand_nimm_predicate>")
+ (match_operand:VF_AVX512FP16VL 3 "<round_expand_nimm_predicate>")
+ (match_operand:<avx512fmaskcmode> 4 "register_operand")]
+ "TARGET_AVX512FP16 && <round_mode512bit_condition>"
+{
+ emit_insn (gen_fma_fmaddc_<mode>_maskz_1<round_expand_name> (
+ operands[0], operands[1], operands[2], operands[3],
+ CONST0_RTX (<MODE>mode), operands[4]<round_expand_operand>));
+ DONE;
+})
+
+(define_expand "<avx512>_fcmaddc_<mode>_maskz<round_expand_name>"
+ [(match_operand:VF_AVX512FP16VL 0 "register_operand")
+ (match_operand:VF_AVX512FP16VL 1 "<round_expand_nimm_predicate>")
+ (match_operand:VF_AVX512FP16VL 2 "<round_expand_nimm_predicate>")
+ (match_operand:VF_AVX512FP16VL 3 "<round_expand_nimm_predicate>")
+ (match_operand:<avx512fmaskcmode> 4 "register_operand")]
+ "TARGET_AVX512FP16 && <round_mode512bit_condition>"
+{
+ emit_insn (gen_fma_fcmaddc_<mode>_maskz_1<round_expand_name> (
+ operands[0], operands[1], operands[2], operands[3],
+ CONST0_RTX (<MODE>mode), operands[4]<round_expand_operand>));
+ DONE;
+})
+
+(define_insn "fma_<complexopname>_<mode><sdc_maskz_name><round_name>"
+ [(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=&v")
+ (unspec:VF_AVX512FP16VL
+ [(match_operand:VF_AVX512FP16VL 1 "<round_nimm_predicate>" "%v")
+ (match_operand:VF_AVX512FP16VL 2 "<round_nimm_predicate>" "<round_constraint>")
+ (match_operand:VF_AVX512FP16VL 3 "<round_nimm_predicate>" "0")]
+ UNSPEC_COMPLEX_F_C_MA))]
+ "TARGET_AVX512FP16 && <sdc_mask_mode512bit_condition> && <round_mode512bit_condition>"
+ "v<complexopname><ssemodesuffix>\t{<round_sdc_mask_op4>%2, %1, %0<sdc_mask_op4>|%0<sdc_mask_op4>, %1, %2<round_sdc_mask_op4>}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "<MODE>")])
+
+(define_insn "<avx512>_<complexopname>_<mode>_mask<round_name>"
+ [(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=&v")
+ (vec_merge:VF_AVX512FP16VL
+ (unspec:VF_AVX512FP16VL
+ [(match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "%v")
+ (match_operand:VF_AVX512FP16VL 2 "nonimmediate_operand" "<round_constraint>")
+ (match_operand:VF_AVX512FP16VL 3 "register_operand" "0")]
+ UNSPEC_COMPLEX_F_C_MA)
+ (match_dup 1)
+ (unspec:<avx512fmaskmode>
+ [(match_operand:<avx512fmaskcmode> 4 "register_operand" "Yk")]
+ UNSPEC_COMPLEX_MASK)))]
+ "TARGET_AVX512FP16 && <round_mode512bit_condition>"
+ "v<complexopname><ssemodesuffix>\t{<round_op5>%2, %1, %0%{%4%}|%0%{%4%}, %1, %2<round_op5>}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "<MODE>")])
+
+(define_insn "<avx512>_<complexopname>_<mode><maskc_name><round_name>"
+ [(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=&v")
+ (unspec:VF_AVX512FP16VL
+ [(match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "%v")
+ (match_operand:VF_AVX512FP16VL 2 "nonimmediate_operand" "<round_constraint>")]
+ UNSPEC_COMPLEX_F_C_MUL))]
+ "TARGET_AVX512FP16 && <round_mode512bit_condition>"
+ "v<complexopname><ssemodesuffix>\t{<round_maskc_op3>%2, %1, %0<maskc_operand3>|%0<maskc_operand3>, %1, %2<round_maskc_op3>}"
+ [(set_attr "type" "ssemul")
+ (set_attr "mode" "<MODE>")])
+
+(define_expand "avx512fp16_fmaddcsh_v8hf_maskz<round_expand_name>"
+ [(match_operand:V8HF 0 "register_operand")
+ (match_operand:V8HF 1 "<round_expand_nimm_predicate>")
+ (match_operand:V8HF 2 "<round_expand_nimm_predicate>")
+ (match_operand:V8HF 3 "<round_expand_nimm_predicate>")
+ (match_operand:QI 4 "register_operand")]
+ "TARGET_AVX512FP16 && <round_mode512bit_condition>"
+{
+ emit_insn (gen_avx512fp16_fma_fmaddcsh_v8hf_maskz<round_expand_name> (
+ operands[0], operands[1], operands[2], operands[3],
+ CONST0_RTX (V8HFmode), operands[4]<round_expand_operand>));
+ DONE;
+})
+
+(define_expand "avx512fp16_fcmaddcsh_v8hf_maskz<round_expand_name>"
+ [(match_operand:V8HF 0 "register_operand")
+ (match_operand:V8HF 1 "<round_expand_nimm_predicate>")
+ (match_operand:V8HF 2 "<round_expand_nimm_predicate>")
+ (match_operand:V8HF 3 "<round_expand_nimm_predicate>")
+ (match_operand:QI 4 "register_operand")]
+ "TARGET_AVX512FP16 && <round_mode512bit_condition>"
+{
+ emit_insn (gen_avx512fp16_fma_fcmaddcsh_v8hf_maskz<round_expand_name> (
+ operands[0], operands[1], operands[2], operands[3],
+ CONST0_RTX (V8HFmode), operands[4]<round_expand_operand>));
+ DONE;
+})
+
+(define_insn "avx512fp16_fma_<complexopname>sh_v8hf<mask_scalarcz_name><round_scalarcz_name>"
+ [(set (match_operand:V8HF 0 "register_operand" "=&v")
+ (vec_merge:V8HF
+ (unspec:V8HF
+ [(match_operand:V8HF 1 "<round_scalarcz_nimm_predicate>" "v")
+ (match_operand:V8HF 2 "<round_scalarcz_nimm_predicate>" "<round_scalarcz_constraint>")
+ (match_operand:V8HF 3 "<round_scalarcz_nimm_predicate>" "0")]
+ UNSPEC_COMPLEX_F_C_MA)
+ (match_dup 2)
+ (const_int 3)))]
+ "TARGET_AVX512FP16"
+ "v<complexopname>sh\t{<round_scalarcz_mask_op4>%2, %1, %0<mask_scalarcz_operand4>|%0<mask_scalarcz_operand4>, %1, %2<round_scalarcz_maskcz_mask_op4>}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "V8HF")])
+
+(define_insn "avx512fp16_<complexopname>sh_v8hf_mask<round_name>"
+ [(set (match_operand:V8HF 0 "register_operand" "=&v")
+ (vec_merge:V8HF
+ (vec_merge:V8HF
+ (unspec:V8HF
+ [(match_operand:V8HF 1 "<round_nimm_predicate>" "v")
+ (match_operand:V8HF 2 "<round_nimm_predicate>" "<round_constraint>")
+ (match_operand:V8HF 3 "<round_nimm_predicate>" "0")]
+ UNSPEC_COMPLEX_F_C_MA)
+ (match_dup 1)
+ (unspec:QI [(match_operand:QI 4 "register_operand" "Yk")]
+ UNSPEC_COMPLEX_MASK))
+ (match_dup 2)
+ (const_int 3)))]
+ "TARGET_AVX512FP16"
+ "v<complexopname>sh\t{<round_op5>%2, %1, %0%{%4%}|%0%{%4%}, %1, %2<round_op5>}"
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "V8HF")])
+
+(define_insn "avx512fp16_<complexopname>sh_v8hf<mask_scalarc_name><round_scalarcz_name>"
+ [(set (match_operand:V8HF 0 "register_operand" "=&v")
+ (vec_merge:V8HF
+ (unspec:V8HF
+ [(match_operand:V8HF 1 "nonimmediate_operand" "v")
+ (match_operand:V8HF 2 "<round_scalarcz_nimm_predicate>" "<round_scalarcz_constraint>")]
+ UNSPEC_COMPLEX_F_C_MUL)
+ (match_dup 1)
+ (const_int 3)))]
+ "TARGET_AVX512FP16"
+ "v<complexopname>sh\t{<round_scalarc_mask_op3>%2, %1, %0<mask_scalarc_operand3>|%0<mask_scalarc_operand3>, %1, %2<round_scalarc_mask_op3>}"
+ [(set_attr "type" "ssemul")
+ (set_attr "mode" "V8HF")])
+
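
The complex patterns are easier to read against a scalar model. The following sketch gives the presumed semantics, inferred from the fmaddc/fcmaddc names; single precision stands in for _Float16, and which source the "c" variants conjugate is an assumption, not taken from the patch.

#include <complex.h>

static _Complex float
fmaddc_model (_Complex float a, _Complex float b, _Complex float c)
{
  return a * b + c;            /* vfmaddcph: complex multiply-add */
}

static _Complex float
fcmaddc_model (_Complex float a, _Complex float b, _Complex float c)
{
  return conjf (a) * b + c;    /* vfcmaddcph: conjugate one source (assumed) */
}
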
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;; Parallel half-precision floating point conversion operations
@@ -5824,6 +6040,12 @@
(set_attr "prefix" "evex")
(set_attr "mode" "<sseinsnmode>")])
+(define_expand "float<floatunssuffix><mode><ssePHmodelower>2"
+ [(set (match_operand:<ssePHmode> 0 "register_operand")
+ (any_float:<ssePHmode>
+ (match_operand:VI2H_AVX512VL 1 "nonimmediate_operand")))]
+ "TARGET_AVX512FP16")
+
(define_insn "avx512fp16_vcvt<floatsuffix><sseintconvert>2ph_<mode><mask_name><round_name>"
[(set (match_operand:<ssePHmode> 0 "register_operand" "=v")
(any_float:<ssePHmode>
@@ -5834,11 +6056,23 @@
(set_attr "prefix" "evex")
(set_attr "mode" "<sseinsnmode>")])
-(define_expand "avx512fp16_vcvt<floatsuffix><sseintconvert>2ph_<mode>"
- [(set (match_operand:V8HF 0 "register_operand" "=v")
+(define_expand "float<floatunssuffix><mode>v4hf2"
+ [(set (match_operand:V4HF 0 "register_operand")
+ (any_float:V4HF
+ (match_operand:VI4_128_8_256 1 "vector_operand")))]
+ "TARGET_AVX512FP16 && TARGET_AVX512VL"
+{
+ operands[0] = lowpart_subreg (V8HFmode, operands[0], V4HFmode);
+ emit_insn (gen_avx512fp16_float<floatunssuffix><mode>v4hf2 (operands[0],
+ operands[1]));
+ DONE;
+})
+
+(define_expand "avx512fp16_float<floatunssuffix><mode>v4hf2"
+ [(set (match_operand:V8HF 0 "register_operand")
(vec_concat:V8HF
- (any_float:V4HF (match_operand:VI4_128_8_256 1 "vector_operand" "vm"))
- (match_dup 2)))]
+ (any_float:V4HF (match_operand:VI4_128_8_256 1 "vector_operand"))
+ (match_dup 2)))]
"TARGET_AVX512FP16 && TARGET_AVX512VL"
"operands[2] = CONST0_RTX (V4HFmode);")
@@ -5897,11 +6131,23 @@
(set_attr "prefix" "evex")
(set_attr "mode" "<sseinsnmode>")])
-(define_expand "avx512fp16_vcvt<floatsuffix>qq2ph_v2di"
- [(set (match_operand:V8HF 0 "register_operand" "=v")
+(define_expand "float<floatunssuffix>v2div2hf2"
+ [(set (match_operand:V2HF 0 "register_operand")
+ (any_float:V2HF
+ (match_operand:V2DI 1 "vector_operand")))]
+ "TARGET_AVX512FP16 && TARGET_AVX512VL"
+{
+ operands[0] = lowpart_subreg (V8HFmode, operands[0], V2HFmode);
+ emit_insn (gen_avx512fp16_float<floatunssuffix>v2div2hf2 (operands[0],
+ operands[1]));
+ DONE;
+})
+
+(define_expand "avx512fp16_float<floatunssuffix>v2div2hf2"
+ [(set (match_operand:V8HF 0 "register_operand")
(vec_concat:V8HF
- (any_float:V2HF (match_operand:V2DI 1 "vector_operand" "vm"))
- (match_dup 2)))]
+ (any_float:V2HF (match_operand:V2DI 1 "vector_operand"))
+ (match_dup 2)))]
"TARGET_AVX512FP16 && TARGET_AVX512VL"
"operands[2] = CONST0_RTX (V6HFmode);")
@@ -6000,6 +6246,12 @@
(set_attr "prefix" "evex")
(set_attr "mode" "HF")])
+(define_expand "fix<fixunssuffix>_trunc<ssePHmodelower><mode>2"
+ [(set (match_operand:VI2H_AVX512VL 0 "register_operand")
+ (any_fix:VI2H_AVX512VL
+ (match_operand:<ssePHmode> 1 "nonimmediate_operand")))]
+ "TARGET_AVX512FP16")
+
(define_insn "avx512fp16_fix<fixunssuffix>_trunc<mode>2<mask_name><round_saeonly_name>"
[(set (match_operand:VI2H_AVX512VL 0 "register_operand" "=v")
(any_fix:VI2H_AVX512VL
@@ -6010,6 +6262,21 @@
(set_attr "prefix" "evex")
(set_attr "mode" "<sseinsnmode>")])
+(define_expand "fix<fixunssuffix>_truncv4hf<mode>2"
+ [(set (match_operand:VI4_128_8_256 0 "register_operand")
+ (any_fix:VI4_128_8_256
+ (match_operand:V4HF 1 "nonimmediate_operand")))]
+ "TARGET_AVX512FP16 && TARGET_AVX512VL"
+{
+ if (!MEM_P (operands[1]))
+ {
+ operands[1] = lowpart_subreg (V8HFmode, operands[1], V4HFmode);
+ emit_insn (gen_avx512fp16_fix<fixunssuffix>_trunc<mode>2 (operands[0],
+ operands[1]));
+ DONE;
+ }
+})
+
(define_insn "avx512fp16_fix<fixunssuffix>_trunc<mode>2<mask_name>"
[(set (match_operand:VI4_128_8_256 0 "register_operand" "=v")
(any_fix:VI4_128_8_256
@@ -6032,6 +6299,21 @@
(set_attr "prefix" "evex")
(set_attr "mode" "<sseinsnmode>")])
+(define_expand "fix<fixunssuffix>_truncv2hfv2di2"
+ [(set (match_operand:V2DI 0 "register_operand")
+ (any_fix:V2DI
+ (match_operand:V2HF 1 "nonimmediate_operand")))]
+ "TARGET_AVX512FP16 && TARGET_AVX512VL"
+{
+ if (!MEM_P (operands[1]))
+ {
+ operands[1] = lowpart_subreg (V8HFmode, operands[1], V2HFmode);
+ emit_insn (gen_avx512fp16_fix<fixunssuffix>_truncv2di2 (operands[0],
+ operands[1]));
+ DONE;
+ }
+})
+
(define_insn "avx512fp16_fix<fixunssuffix>_truncv2di2<mask_name>"
[(set (match_operand:V2DI 0 "register_operand" "=v")
(any_fix:V2DI
@@ -6080,6 +6362,12 @@
[(V16SF "x") (V8SF "x") (V4SF "x")
(V8DF "") (V4DF "") (V2DF "")])
+(define_expand "extend<ssePHmodelower><mode>2"
+ [(set (match_operand:VF48H_AVX512VL 0 "register_operand")
+ (float_extend:VF48H_AVX512VL
+ (match_operand:<ssePHmode> 1 "nonimmediate_operand")))]
+ "TARGET_AVX512FP16")
+
(define_insn "avx512fp16_float_extend_ph<mode>2<mask_name><round_saeonly_name>"
[(set (match_operand:VF48H_AVX512VL 0 "register_operand" "=v")
(float_extend:VF48H_AVX512VL
@@ -6090,6 +6378,21 @@
(set_attr "prefix" "evex")
(set_attr "mode" "<sseinsnmode>")])
+(define_expand "extendv4hf<mode>2"
+ [(set (match_operand:VF4_128_8_256 0 "register_operand")
+ (float_extend:VF4_128_8_256
+ (match_operand:V4HF 1 "nonimmediate_operand")))]
+ "TARGET_AVX512FP16 && TARGET_AVX512VL"
+{
+ if (!MEM_P (operands[1]))
+ {
+ operands[1] = lowpart_subreg (V8HFmode, operands[1], V4HFmode);
+ emit_insn (gen_avx512fp16_float_extend_ph<mode>2
+ (operands[0], operands[1]));
+ DONE;
+ }
+})
+
(define_insn "avx512fp16_float_extend_ph<mode>2<mask_name>"
[(set (match_operand:VF4_128_8_256 0 "register_operand" "=v")
(float_extend:VF4_128_8_256
@@ -6112,6 +6415,21 @@
(set_attr "prefix" "evex")
(set_attr "mode" "<sseinsnmode>")])
+(define_expand "extendv2hfv2df2"
+ [(set (match_operand:V2DF 0 "register_operand")
+ (float_extend:V2DF
+ (match_operand:V2HF 1 "nonimmediate_operand")))]
+ "TARGET_AVX512FP16 && TARGET_AVX512VL"
+{
+ if (!MEM_P (operands[1]))
+ {
+ operands[1] = lowpart_subreg (V8HFmode, operands[1], V2HFmode);
+ emit_insn (gen_avx512fp16_float_extend_phv2df2
+ (operands[0], operands[1]));
+ DONE;
+ }
+})
+
(define_insn "avx512fp16_float_extend_phv2df2<mask_name>"
[(set (match_operand:V2DF 0 "register_operand" "=v")
(float_extend:V2DF
@@ -6134,6 +6452,12 @@
(set_attr "prefix" "evex")
(set_attr "mode" "TI")])
+(define_expand "trunc<mode><ssePHmodelower>2"
+ [(set (match_operand:<ssePHmode> 0 "register_operand")
+ (float_truncate:<ssePHmode>
+ (match_operand:VF48H_AVX512VL 1 "nonimmediate_operand")))]
+ "TARGET_AVX512FP16")
+
(define_insn "avx512fp16_vcvt<castmode>2ph_<mode><mask_name><round_name>"
[(set (match_operand:<ssePHmode> 0 "register_operand" "=v")
(float_truncate:<ssePHmode>
@@ -6144,11 +6468,21 @@
(set_attr "prefix" "evex")
(set_attr "mode" "<sseinsnmode>")])
-(define_expand "avx512fp16_vcvt<castmode>2ph_<mode>"
- [(set (match_operand:V8HF 0 "register_operand" "=v")
+(define_expand "trunc<mode>v4hf2"
+ [(set (match_operand:V4HF 0 "register_operand")
+ (float_truncate:V4HF (match_operand:VF4_128_8_256 1 "vector_operand")))]
+ "TARGET_AVX512FP16 && TARGET_AVX512VL"
+{
+ operands[0] = lowpart_subreg (V8HFmode, operands[0], V4HFmode);
+ emit_insn (gen_avx512fp16_trunc<mode>v4hf2 (operands[0], operands[1]));
+ DONE;
+})
+
+(define_expand "avx512fp16_trunc<mode>v4hf2"
+ [(set (match_operand:V8HF 0 "register_operand")
(vec_concat:V8HF
(float_truncate:V4HF
- (match_operand:VF4_128_8_256 1 "vector_operand" "vm"))
+ (match_operand:VF4_128_8_256 1 "vector_operand"))
(match_dup 2)))]
"TARGET_AVX512FP16 && TARGET_AVX512VL"
"operands[2] = CONST0_RTX (V4HFmode);")
@@ -6213,11 +6547,20 @@
(set_attr "prefix" "evex")
(set_attr "mode" "<sseinsnmode>")])
-(define_expand "avx512fp16_vcvtpd2ph_v2df"
- [(set (match_operand:V8HF 0 "register_operand" "=v")
+(define_expand "truncv2dfv2hf2"
+ [(set (match_operand:V2HF 0 "register_operand")
+ (float_truncate:V2HF (match_operand:V2DF 1 "vector_operand")))]
+ "TARGET_AVX512FP16 && TARGET_AVX512VL"
+{
+ operands[0] = lowpart_subreg (V8HFmode, operands[0], V2HFmode);
+ emit_insn (gen_avx512fp16_truncv2dfv2hf2 (operands[0], operands[1]));
+ DONE;
+})
+
+(define_expand "avx512fp16_truncv2dfv2hf2"
+ [(set (match_operand:V8HF 0 "register_operand")
(vec_concat:V8HF
- (float_truncate:V2HF
- (match_operand:V2DF 1 "vector_operand" "vm"))
+ (float_truncate:V2HF (match_operand:V2DF 1 "vector_operand"))
(match_dup 2)))]
"TARGET_AVX512FP16 && TARGET_AVX512VL"
"operands[2] = CONST0_RTX (V6HFmode);")
@@ -15229,6 +15572,21 @@
DONE;
})
+(define_expand "vcondu<mode><sseintvecmodelower>"
+ [(set (match_operand:VF_AVX512FP16VL 0 "register_operand")
+ (if_then_else:VF_AVX512FP16VL
+ (match_operator 3 ""
+ [(match_operand:<sseintvecmode> 4 "vector_operand")
+ (match_operand:<sseintvecmode> 5 "vector_operand")])
+ (match_operand:VF_AVX512FP16VL 1 "general_operand")
+ (match_operand:VF_AVX512FP16VL 2 "general_operand")))]
+ "TARGET_AVX512FP16"
+{
+ bool ok = ix86_expand_int_vcond (operands);
+ gcc_assert (ok);
+ DONE;
+})
+
(define_expand "vcondeq<VI8F_128:mode>v2di"
[(set (match_operand:VI8F_128 0 "register_operand")
(if_then_else:VI8F_128
@@ -21298,14 +21656,14 @@
(set_attr "mode" "<MODE>")])
(define_insn "*sse4_1_round<ssescalarmodesuffix>"
- [(set (match_operand:VF_128 0 "register_operand" "=Yr,*x,x,v")
- (vec_merge:VF_128
- (vec_duplicate:VF_128
+ [(set (match_operand:VFH_128 0 "register_operand" "=Yr,*x,x,v")
+ (vec_merge:VFH_128
+ (vec_duplicate:VFH_128
(unspec:<ssescalarmode>
[(match_operand:<ssescalarmode> 2 "nonimmediate_operand" "Yrm,*xm,xm,vm")
(match_operand:SI 3 "const_0_to_15_operand" "n,n,n,n")]
UNSPEC_ROUND))
- (match_operand:VF_128 1 "register_operand" "0,0,x,v")
+ (match_operand:VFH_128 1 "register_operand" "0,0,x,v")
(const_int 1)))]
"TARGET_SSE4_1"
"@
diff --git a/gcc/config/i386/subst.md b/gcc/config/i386/subst.md
index 157d49f..11e62c6 100644
--- a/gcc/config/i386/subst.md
+++ b/gcc/config/i386/subst.md
@@ -28,6 +28,9 @@
V16SF V8SF V4SF
V8DF V4DF V2DF])
+(define_mode_iterator SUBST_CV
+ [V32HF V16HF V8HF])
+
(define_mode_iterator SUBST_S
[QI HI SI DI])
@@ -42,9 +45,11 @@
QI HI SI DI SF DF])
(define_subst_attr "mask_name" "mask" "" "_mask")
+(define_subst_attr "maskc_name" "maskc" "" "_mask")
(define_subst_attr "mask_applied" "mask" "false" "true")
(define_subst_attr "mask_operand2" "mask" "" "%{%3%}%N2")
(define_subst_attr "mask_operand3" "mask" "" "%{%4%}%N3")
+(define_subst_attr "maskc_operand3" "maskc" "" "%{%4%}%N3")
(define_subst_attr "mask_operand3_1" "mask" "" "%%{%%4%%}%%N3") ;; for sprintf
(define_subst_attr "mask_operand4" "mask" "" "%{%5%}%N4")
(define_subst_attr "mask_operand6" "mask" "" "%{%7%}%N6")
@@ -89,6 +94,18 @@
(match_dup 0)
(match_operand:<avx512fmaskmode> 2 "register_operand" "Yk")))])
+(define_subst "maskc"
+ [(set (match_operand:SUBST_CV 0)
+ (match_operand:SUBST_CV 1))]
+ "TARGET_AVX512F"
+ [(set (match_dup 0)
+ (vec_merge:SUBST_CV
+ (match_dup 1)
+ (match_operand:SUBST_CV 2 "nonimm_or_0_operand" "0C")
+ (unspec:<avx512fmaskmode>
+ [(match_operand:<avx512fmaskcmode> 3 "register_operand" "Yk")]
+ UNSPEC_COMPLEX_MASK)))])
+
(define_subst_attr "mask_scalar_merge_name" "mask_scalar_merge" "" "_mask")
(define_subst_attr "mask_scalar_merge_operand3" "mask_scalar_merge" "" "%{%3%}")
(define_subst_attr "mask_scalar_merge_operand4" "mask_scalar_merge" "" "%{%4%}")
@@ -137,12 +154,31 @@
(match_operand:<avx512fmaskmode> 4 "register_operand" "Yk"))
(match_dup 2)
(const_int 1)))])
+(define_subst_attr "sdc_maskz_name" "sdc" "" "_maskz_1")
+(define_subst_attr "sdc_mask_op4" "sdc" "" "%{%5%}%N4")
+(define_subst_attr "sdc_mask_op5" "sdc" "" "%{%6%}%N5")
+(define_subst_attr "sdc_mask_mode512bit_condition" "sdc" "1" "(<MODE_SIZE> == 64 || TARGET_AVX512VL)")
+
+(define_subst "sdc"
+ [(set (match_operand:SUBST_CV 0)
+ (match_operand:SUBST_CV 1))]
+ ""
+ [(set (match_dup 0)
+ (vec_merge:SUBST_CV
+ (match_dup 1)
+ (match_operand:SUBST_CV 2 "const0_operand" "C")
+ (unspec:<avx512fmaskmode>
+ [(match_operand:<avx512fmaskcmode> 3 "register_operand" "Yk")]
+ UNSPEC_COMPLEX_MASK)))
+])
(define_subst_attr "round_name" "round" "" "_round")
(define_subst_attr "round_mask_operand2" "mask" "%R2" "%R4")
(define_subst_attr "round_mask_operand3" "mask" "%R3" "%R5")
+(define_subst_attr "round_maskc_operand3" "maskc" "%R3" "%R5")
(define_subst_attr "round_mask_operand4" "mask" "%R4" "%R6")
(define_subst_attr "round_sd_mask_operand4" "sd" "%R4" "%R6")
+(define_subst_attr "round_sdc_mask_operand4" "sdc" "%R4" "%R6")
(define_subst_attr "round_op2" "round" "" "%R2")
(define_subst_attr "round_op3" "round" "" "%R3")
(define_subst_attr "round_op4" "round" "" "%R4")
@@ -150,8 +186,10 @@
(define_subst_attr "round_op6" "round" "" "%R6")
(define_subst_attr "round_mask_op2" "round" "" "<round_mask_operand2>")
(define_subst_attr "round_mask_op3" "round" "" "<round_mask_operand3>")
+(define_subst_attr "round_maskc_op3" "round" "" "<round_maskc_operand3>")
(define_subst_attr "round_mask_op4" "round" "" "<round_mask_operand4>")
(define_subst_attr "round_sd_mask_op4" "round" "" "<round_sd_mask_operand4>")
+(define_subst_attr "round_sdc_mask_op4" "round" "" "<round_sdc_mask_operand4>")
(define_subst_attr "round_constraint" "round" "vm" "v")
(define_subst_attr "round_qq2phsuff" "round" "<qq2phsuff>" "")
(define_subst_attr "bcst_round_constraint" "round" "vmBr" "v")
@@ -189,6 +227,7 @@
(define_subst_attr "round_saeonly_mask_scalar_merge_operand4" "mask_scalar_merge" "%r4" "%r5")
(define_subst_attr "round_saeonly_maskz_scalar_operand5" "maskz_scalar" "%r5" "%r7")
(define_subst_attr "round_saeonly_sd_mask_operand5" "sd" "%r5" "%r7")
+(define_subst_attr "round_saeonly_sdc_mask_operand5" "sdc" "%r5" "%r7")
(define_subst_attr "round_saeonly_op2" "round_saeonly" "" "%r2")
(define_subst_attr "round_saeonly_op3" "round_saeonly" "" "%r3")
(define_subst_attr "round_saeonly_op4" "round_saeonly" "" "%r4")
@@ -289,8 +328,12 @@
(match_operand:<avx512fmaskmode> 5 "register_operand")])
(define_subst_attr "mask_scalar_name" "mask_scalar" "" "_mask")
+(define_subst_attr "mask_scalarcz_name" "mask_scalarcz" "" "_maskz")
+(define_subst_attr "mask_scalarc_name" "mask_scalarc" "" "_mask")
+(define_subst_attr "mask_scalarc_operand3" "mask_scalarc" "" "%{%4%}%N3")
(define_subst_attr "mask_scalar_operand3" "mask_scalar" "" "%{%4%}%N3")
(define_subst_attr "mask_scalar_operand4" "mask_scalar" "" "%{%5%}%N4")
+(define_subst_attr "mask_scalarcz_operand4" "mask_scalarcz" "" "%{%5%}%N4")
(define_subst "mask_scalar"
[(set (match_operand:SUBST_V 0)
@@ -308,12 +351,55 @@
(match_dup 2)
(const_int 1)))])
+(define_subst "mask_scalarcz"
+ [(set (match_operand:SUBST_CV 0)
+ (vec_merge:SUBST_CV
+ (match_operand:SUBST_CV 1)
+ (match_operand:SUBST_CV 2)
+ (const_int 3)))]
+ "TARGET_AVX512F"
+ [(set (match_dup 0)
+ (vec_merge:SUBST_CV
+ (vec_merge:SUBST_CV
+ (match_dup 1)
+ (match_operand:SUBST_CV 3 "const0_operand" "C")
+ (unspec:<avx512fmaskmode>
+ [(match_operand:<avx512fmaskcmode> 4 "register_operand" "Yk")]
+ UNSPEC_COMPLEX_MASK))
+ (match_dup 2)
+ (const_int 3)))])
+
+(define_subst "mask_scalarc"
+ [(set (match_operand:SUBST_CV 0)
+ (vec_merge:SUBST_CV
+ (match_operand:SUBST_CV 1)
+ (match_operand:SUBST_CV 2)
+ (const_int 3)))]
+ "TARGET_AVX512F"
+ [(set (match_dup 0)
+ (vec_merge:SUBST_CV
+ (vec_merge:SUBST_CV
+ (match_dup 1)
+ (match_operand:SUBST_CV 3 "nonimm_or_0_operand" "0C")
+ (unspec:<avx512fmaskmode>
+ [(match_operand:<avx512fmaskcmode> 4 "register_operand" "Yk")]
+ UNSPEC_COMPLEX_MASK))
+ (match_dup 2)
+ (const_int 3)))])
+
(define_subst_attr "round_scalar_name" "round_scalar" "" "_round")
+(define_subst_attr "round_scalarcz_name" "round_scalarcz" "" "_round")
(define_subst_attr "round_scalar_mask_operand3" "mask_scalar" "%R3" "%R5")
+(define_subst_attr "round_scalarc_mask_operand3" "mask_scalarc" "%R3" "%R5")
+(define_subst_attr "round_scalarcz_mask_operand4" "mask_scalarcz" "%R4" "%R6")
(define_subst_attr "round_scalar_mask_op3" "round_scalar" "" "<round_scalar_mask_operand3>")
+(define_subst_attr "round_scalarc_mask_op3" "round_scalarcz" "" "<round_scalarc_mask_operand3>")
+(define_subst_attr "round_scalarcz_mask_op4" "round_scalarcz" "" "<round_scalarcz_mask_operand4>")
(define_subst_attr "round_scalar_constraint" "round_scalar" "vm" "v")
+(define_subst_attr "round_scalarcz_constraint" "round_scalarcz" "vm" "v")
(define_subst_attr "round_scalar_prefix" "round_scalar" "vex" "evex")
(define_subst_attr "round_scalar_nimm_predicate" "round_scalar" "nonimmediate_operand" "register_operand")
+(define_subst_attr "round_scalarcz_nimm_predicate" "round_scalarcz" "vector_operand" "register_operand")
(define_subst "round_scalar"
[(set (match_operand:SUBST_V 0)
@@ -331,6 +417,22 @@
(match_operand:SI 3 "const_4_or_8_to_11_operand")]
UNSPEC_EMBEDDED_ROUNDING))])
+(define_subst "round_scalarcz"
+ [(set (match_operand:SUBST_V 0)
+ (vec_merge:SUBST_V
+ (match_operand:SUBST_V 1)
+ (match_operand:SUBST_V 2)
+ (const_int 3)))]
+ "TARGET_AVX512F"
+ [(set (match_dup 0)
+ (unspec:SUBST_V [
+ (vec_merge:SUBST_V
+ (match_dup 1)
+ (match_dup 2)
+ (const_int 3))
+ (match_operand:SI 3 "const_4_or_8_to_11_operand")]
+ UNSPEC_EMBEDDED_ROUNDING))])
+
(define_subst_attr "round_saeonly_scalar_name" "round_saeonly_scalar" "" "_round")
(define_subst_attr "round_saeonly_scalar_mask_operand3" "mask_scalar" "%r3" "%r5")
(define_subst_attr "round_saeonly_scalar_mask_operand4" "mask_scalar" "%r4" "%r6")
diff --git a/gcc/config/lm32/uclinux-elf.h b/gcc/config/lm32/uclinux-elf.h
index 370df4c5..5b638fa 100644
--- a/gcc/config/lm32/uclinux-elf.h
+++ b/gcc/config/lm32/uclinux-elf.h
@@ -67,6 +67,7 @@
#define TARGET_OS_CPP_BUILTINS() GNU_USER_TARGET_OS_CPP_BUILTINS()
+#undef LINK_GCC_C_SEQUENCE_SPEC
#define LINK_GCC_C_SEQUENCE_SPEC \
"%{static|static-pie:--start-group} %G %{!nolibc:%L} \
%{static|static-pie:--end-group}%{!static:%{!static-pie:%G}}"
diff --git a/gcc/config/pa/pa.c b/gcc/config/pa/pa.c
index 0614302..69ba5bd 100644
--- a/gcc/config/pa/pa.c
+++ b/gcc/config/pa/pa.c
@@ -541,6 +541,16 @@ pa_option_override (void)
write_symbols = NO_DEBUG;
}
+ if (TARGET_64BIT && TARGET_HPUX)
+ {
+ /* DWARF5 is not supported by gdb. Don't emit DWARF5 unless
+ specifically selected. */
+ if (!global_options_set.x_dwarf_strict)
+ dwarf_strict = 1;
+ if (!global_options_set.x_dwarf_version)
+ dwarf_version = 4;
+ }
+
/* We only support the "big PIC" model now. And we always generate PIC
code when in 64bit mode. */
if (flag_pic == 1 || TARGET_64BIT)
diff --git a/gcc/config/pru/constraints.md b/gcc/config/pru/constraints.md
index a31ae93..1e0e703 100644
--- a/gcc/config/pru/constraints.md
+++ b/gcc/config/pru/constraints.md
@@ -34,6 +34,7 @@
;; The following constraints are intended for internal use only:
;; Rmd0, Rms0, Rms1: Registers for MUL instruction operands.
;; Rsib: Jump address register suitable for sibling calls.
+;; Rrio: The R30 and R31 I/O registers.
;; M: -255 to 0 (for converting ADD to SUB with suitable UBYTE OP2).
;; N: -32768 to 32767 (16-bit signed integer).
;; O: -128 to 127 (8-bit signed integer).
@@ -57,6 +58,10 @@
"@internal
The multiply source 1 register.")
+(define_register_constraint "Rrio" "REGIO_REGS"
+ "@internal
+ The R30 and R31 I/O registers.")
+
;; Integer constraints.
(define_constraint "I"
diff --git a/gcc/config/pru/predicates.md b/gcc/config/pru/predicates.md
index 469002f..1a4b98e 100644
--- a/gcc/config/pru/predicates.md
+++ b/gcc/config/pru/predicates.md
@@ -121,6 +121,25 @@
return 0;
})
+(define_predicate "regio_operand"
+ (match_code "subreg,reg")
+{
+ if (register_operand (op, mode))
+ {
+ int regno;
+
+ if (REG_P (op))
+ regno = REGNO (op);
+ else if (GET_CODE (op) == SUBREG && REG_P (SUBREG_REG (op)))
+ regno = REGNO (SUBREG_REG (op));
+ else
+ return 0;
+
+ return REGNO_REG_CLASS (regno) == REGIO_REGS;
+ }
+ return 0;
+})
+
(define_predicate "reg_or_const_int_operand"
(ior (match_operand 0 "const_int_operand")
(match_operand 0 "register_operand")))
diff --git a/gcc/config/pru/pru-pragma.c b/gcc/config/pru/pru-pragma.c
index 01d0761..3beec23 100644
--- a/gcc/config/pru/pru-pragma.c
+++ b/gcc/config/pru/pru-pragma.c
@@ -83,4 +83,6 @@ pru_register_pragmas (void)
{
c_register_pragma (NULL, "ctable_entry", pru_pragma_ctable_entry);
c_register_pragma (NULL, "CTABLE_ENTRY", pru_pragma_ctable_entry);
+
+ c_register_addr_space ("__regio_symbol", ADDR_SPACE_REGIO);
}
diff --git a/gcc/config/pru/pru-protos.h b/gcc/config/pru/pru-protos.h
index 74129e9..031ea9e 100644
--- a/gcc/config/pru/pru-protos.h
+++ b/gcc/config/pru/pru-protos.h
@@ -62,7 +62,10 @@ extern int pru_get_ctable_exact_base_index (unsigned HOST_WIDE_INT caddr);
extern int pru_get_ctable_base_index (unsigned HOST_WIDE_INT caddr);
extern int pru_get_ctable_base_offset (unsigned HOST_WIDE_INT caddr);
+extern int pru_symref2ioregno (rtx op);
+
extern void pru_register_abicheck_pass (void);
+
#endif /* RTX_CODE */
#ifdef TREE_CODE
diff --git a/gcc/config/pru/pru.c b/gcc/config/pru/pru.c
index 30d0da1..9f264b4 100644
--- a/gcc/config/pru/pru.c
+++ b/gcc/config/pru/pru.c
@@ -1403,11 +1403,42 @@ pru_valid_addr_expr_p (machine_mode mode, rtx base, rtx offset, bool strict_p)
return false;
}
-/* Implement TARGET_LEGITIMATE_ADDRESS_P. */
+/* Return the register number (either r30 or r31) that maps to the
+ corresponding symbol OP's name in the __regio_symbol address namespace.
+
+ If no mapping can be established (i.e. symbol name is invalid), then
+ return -1. */
+int
+pru_symref2ioregno (rtx op)
+{
+ if (!SYMBOL_REF_P (op))
+ return -1;
+
+ const char *name = XSTR (op, 0);
+ if (!strcmp (name, "__R30"))
+ return R30_REGNUM;
+ else if (!strcmp (name, "__R31"))
+ return R31_REGNUM;
+ else
+ return -1;
+}
+
+/* Implement TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P. */
static bool
-pru_legitimate_address_p (machine_mode mode,
- rtx operand, bool strict_p)
+pru_addr_space_legitimate_address_p (machine_mode mode, rtx operand,
+ bool strict_p, addr_space_t as)
{
+ if (as == ADDR_SPACE_REGIO)
+ {
+ /* Address space constraints for __regio_symbol have been checked in
+ TARGET_INSERT_ATTRIBUTES, and some more checks will be done
+ during RTL expansion of "mov<mode>". */
+ return true;
+ }
+ else if (as != ADDR_SPACE_GENERIC)
+ {
+ gcc_unreachable ();
+ }
+
switch (GET_CODE (operand))
{
/* Direct. */
@@ -2002,6 +2033,117 @@ pru_file_start (void)
need to confuse users with this warning. */
fprintf (asm_out_file, "\t.set no_warn_regname_label\n");
}
+
+/* Scan type TYP for pointer references to address space other than
+   ADDR_SPACE_GENERIC.  Return true if such a reference is found.
+ Much of this code was taken from the avr port. */
+
+static bool
+pru_nongeneric_pointer_addrspace (tree typ)
+{
+ while (ARRAY_TYPE == TREE_CODE (typ))
+ typ = TREE_TYPE (typ);
+
+ if (POINTER_TYPE_P (typ))
+ {
+ addr_space_t as;
+ tree target = TREE_TYPE (typ);
+
+ /* Pointer to function: Test the function's return type. */
+ if (FUNCTION_TYPE == TREE_CODE (target))
+ return pru_nongeneric_pointer_addrspace (TREE_TYPE (target));
+
+ /* "Ordinary" pointers... */
+
+ while (TREE_CODE (target) == ARRAY_TYPE)
+ target = TREE_TYPE (target);
+
+ as = TYPE_ADDR_SPACE (target);
+
+ if (!ADDR_SPACE_GENERIC_P (as))
+ return true;
+
+ /* Scan pointer's target type. */
+ return pru_nongeneric_pointer_addrspace (target);
+ }
+
+ return false;
+}
+
+/* Implement `TARGET_INSERT_ATTRIBUTES'. For PRU it's used as a hook to
+ provide better diagnostics for some invalid usages of the __regio_symbol
+ address space.
+
+ Any escapes of the following checks are supposed to be caught
+ during the "mov<mode>" pattern expansion. */
+
+static void
+pru_insert_attributes (tree node, tree *attributes ATTRIBUTE_UNUSED)
+{
+
+ /* Validate __regio_symbol variable declarations. */
+ if (VAR_P (node))
+ {
+ const char *name = DECL_NAME (node)
+ ? IDENTIFIER_POINTER (DECL_NAME (node))
+ : "<unknown>";
+ tree typ = TREE_TYPE (node);
+ addr_space_t as = TYPE_ADDR_SPACE (typ);
+
+ if (as == ADDR_SPACE_GENERIC)
+ return;
+
+ if (AGGREGATE_TYPE_P (typ))
+ {
+ error ("aggregate types are prohibited in "
+ "%<__regio_symbol%> address space");
+	      /* Don't bother anymore.  The checks below would pile up
+		 meaningless errors and confuse the user.  */
+ return;
+ }
+ if (DECL_INITIAL (node) != NULL_TREE)
+ error ("variables in %<__regio_symbol%> address space "
+ "cannot have initial value");
+ if (DECL_REGISTER (node))
+ error ("variables in %<__regio_symbol%> address space "
+ "cannot be declared %<register%>");
+ if (!TYPE_VOLATILE (typ))
+ error ("variables in %<__regio_symbol%> address space "
+ "must be declared %<volatile%>");
+ if (!DECL_EXTERNAL (node))
+ error ("variables in %<__regio_symbol%> address space "
+ "must be declared %<extern%>");
+ if (TYPE_MODE (typ) != SImode)
+ error ("only 32-bit access is supported "
+ "for %<__regio_symbol%> address space");
+ if (strcmp (name, "__R30") != 0 && strcmp (name, "__R31") != 0)
+ error ("register name %<%s%> not recognized "
+ "in %<__regio_symbol%> address space", name);
+ }
+
+ tree typ = NULL_TREE;
+
+ switch (TREE_CODE (node))
+ {
+ case FUNCTION_DECL:
+ typ = TREE_TYPE (TREE_TYPE (node));
+ break;
+ case TYPE_DECL:
+ case RESULT_DECL:
+ case VAR_DECL:
+ case FIELD_DECL:
+ case PARM_DECL:
+ typ = TREE_TYPE (node);
+ break;
+ case POINTER_TYPE:
+ typ = node;
+ break;
+ default:
+ break;
+ }
+ if (typ != NULL_TREE && pru_nongeneric_pointer_addrspace (typ))
+ error ("pointers to %<__regio_symbol%> address space are prohibited");
+}
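
A usage sketch that should satisfy the checks above (hedged; it simply follows the constraints the code enforces: extern, volatile, 32-bit access, and the names __R30/__R31):

extern volatile __regio_symbol unsigned int __R30;
extern volatile __regio_symbol unsigned int __R31;

void
toggle_pin5 (void)
{
  /* The read and the write expand via UNSPECV_REGIO_READ/WRITE.  */
  __R30 ^= (1u << 5);
}
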
/* Function argument related. */
@@ -2933,6 +3075,9 @@ pru_unwind_word_mode (void)
#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START pru_file_start
+#undef TARGET_INSERT_ATTRIBUTES
+#define TARGET_INSERT_ATTRIBUTES pru_insert_attributes
+
#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS pru_init_builtins
#undef TARGET_EXPAND_BUILTIN
@@ -2979,8 +3124,9 @@ pru_unwind_word_mode (void)
#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
-#undef TARGET_LEGITIMATE_ADDRESS_P
-#define TARGET_LEGITIMATE_ADDRESS_P pru_legitimate_address_p
+#undef TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P
+#define TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P \
+ pru_addr_space_legitimate_address_p
#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS pru_init_libfuncs
diff --git a/gcc/config/pru/pru.h b/gcc/config/pru/pru.h
index 9b6be32..03f08b1 100644
--- a/gcc/config/pru/pru.h
+++ b/gcc/config/pru/pru.h
@@ -215,6 +215,7 @@ enum reg_class
MULDST_REGS,
MULSRC0_REGS,
MULSRC1_REGS,
+ REGIO_REGS,
GP_REGS,
ALL_REGS,
LIM_REG_CLASSES
@@ -229,6 +230,7 @@ enum reg_class
"MULDST_REGS", \
"MULSRC0_REGS", \
"MULSRC1_REGS", \
+ "REGIO_REGS", \
"GP_REGS", \
"ALL_REGS" }
@@ -242,6 +244,7 @@ enum reg_class
/* MULDST_REGS */ { 0, 0, 0, 0x00000f00, 0}, \
/* MULSRC0_REGS */ { 0, 0, 0, 0x000f0000, 0}, \
/* MULSRC1_REGS */ { 0, 0, 0, 0x00f00000, 0}, \
+ /* REGIO_REGS */ { 0, 0, 0, 0xff000000, 0}, \
/* GP_REGS */ { ~0, ~0, ~0, ~0, 0}, \
/* ALL_REGS */ { ~0,~0, ~0, ~0, ~0} \
}
@@ -252,6 +255,8 @@ enum reg_class
((REGNO) == MULDST_REGNUM ? MULDST_REGS \
: (REGNO) == MULSRC0_REGNUM ? MULSRC0_REGS \
: (REGNO) == MULSRC1_REGNUM ? MULSRC1_REGS \
+ : (REGNO) == R30_REGNUM ? REGIO_REGS \
+ : (REGNO) == R31_REGNUM ? REGIO_REGS \
: (REGNO) >= FIRST_ARG_REGNUM \
&& (REGNO) <= LAST_ARG_REGNUM ? SIB_REGS \
: (REGNO) == STATIC_CHAIN_REGNUM ? SIB_REGS \
diff --git a/gcc/config/pru/pru.md b/gcc/config/pru/pru.md
index e6cfa8e..c0ded8e 100644
--- a/gcc/config/pru/pru.md
+++ b/gcc/config/pru/pru.md
@@ -36,6 +36,8 @@
(MULSRC0_REGNUM 112) ; Multiply source register.
(MULSRC1_REGNUM 116) ; Multiply source register.
(LAST_NONIO_GP_REGNUM 119) ; Last non-I/O general purpose register.
+ (R30_REGNUM 120) ; R30 I/O register.
+ (R31_REGNUM 124) ; R31 I/O register.
(LOOPCNTR_REGNUM 128) ; internal LOOP counter register
(LAST_GP_REGNUM 132) ; Last general purpose register.
@@ -49,6 +51,13 @@
]
)
+;; Enumerate address spaces.
+(define_constants
+ [
+ (ADDR_SPACE_REGIO 1) ; Access to R30 and R31 I/O registers.
+ ]
+)
+
;; Enumeration of UNSPECs.
(define_c_enum "unspec" [
@@ -68,6 +77,9 @@
UNSPECV_HALT
UNSPECV_BLOCKAGE
+
+ UNSPECV_REGIO_READ
+ UNSPECV_REGIO_WRITE
])
; Length of an instruction (in bytes).
@@ -129,11 +141,62 @@
(match_operand:MOV8_16_32 1 "general_operand"))]
""
{
- /* It helps to split constant loading and memory access
- early, so that the LDI/LDI32 instructions can be hoisted
- outside a loop body. */
- if (MEM_P (operands[0]))
- operands[1] = force_reg (<MODE>mode, operands[1]);
+ if (MEM_P (operands[0])
+ && MEM_ADDR_SPACE (operands[0]) == ADDR_SPACE_REGIO)
+ {
+ /* Intercept writes to the SImode register I/O "address space". */
+ gcc_assert (<MODE>mode == SImode);
+
+ if (!SYMBOL_REF_P (XEXP (operands[0], 0)))
+ {
+ error ("invalid access to %<__regio_symbol%> address space");
+ FAIL;
+ }
+
+ if (!REG_P (operands[1]))
+ operands[1] = force_reg (<MODE>mode, operands[1]);
+
+ int regiono = pru_symref2ioregno (XEXP (operands[0], 0));
+ gcc_assert (regiono >= 0);
+ rtx regio = gen_rtx_REG (<MODE>mode, regiono);
+ rtx unspecv = gen_rtx_UNSPEC_VOLATILE (<MODE>mode,
+ gen_rtvec (1, operands[1]),
+ UNSPECV_REGIO_WRITE);
+ emit_insn (gen_rtx_SET (regio, unspecv));
+ DONE;
+ }
+ else if (MEM_P (operands[1])
+ && MEM_ADDR_SPACE (operands[1]) == ADDR_SPACE_REGIO)
+ {
+ /* Intercept reads from the SImode register I/O "address space". */
+ gcc_assert (<MODE>mode == SImode);
+
+ if (!SYMBOL_REF_P (XEXP (operands[1], 0)))
+ {
+ error ("invalid access to %<__regio_symbol%> address space");
+ FAIL;
+ }
+
+ if (MEM_P (operands[0]))
+ operands[0] = force_reg (<MODE>mode, operands[0]);
+
+ int regiono = pru_symref2ioregno (XEXP (operands[1], 0));
+ gcc_assert (regiono >= 0);
+ rtx regio = gen_rtx_REG (<MODE>mode, regiono);
+ rtx unspecv = gen_rtx_UNSPEC_VOLATILE (<MODE>mode,
+ gen_rtvec (1, regio),
+ UNSPECV_REGIO_READ);
+ emit_insn (gen_rtx_SET (operands[0], unspecv));
+ DONE;
+ }
+ else if (MEM_P (operands[0]))
+ {
+ /* It helps to split constant loading and memory access
+ early, so that the LDI/LDI32 instructions can be hoisted
+ outside a loop body. */
+ operands[1] = force_reg (<MODE>mode, operands[1]);
+ }
})
;; Keep a single pattern for 32 bit MOV operations. LRA requires that the
@@ -546,6 +609,35 @@
(include "alu-zext.md")
+;; Patterns for accessing the R30/R31 I/O registers.
+
+(define_insn "*regio_readsi"
+ [(set (match_operand:SI 0 "register_operand" "=r")
+ (unspec_volatile:SI
+ [(match_operand:SI 1 "regio_operand" "Rrio")]
+ UNSPECV_REGIO_READ))]
+ ""
+ "mov\\t%0, %1"
+ [(set_attr "type" "alu")])
+
+(define_insn "*regio_nozext_writesi"
+ [(set (match_operand:SI 0 "regio_operand" "=Rrio")
+ (unspec_volatile:SI
+ [(match_operand:SI 1 "register_operand" "r")]
+ UNSPECV_REGIO_WRITE))]
+ ""
+ "mov\\t%0, %1"
+ [(set_attr "type" "alu")])
+
+(define_insn "*regio_zext_write_r30<EQS0:mode>"
+ [(set (match_operand:SI 0 "regio_operand" "=Rrio")
+ (unspec_volatile:SI
+ [(zero_extend:SI (match_operand:EQS0 1 "register_operand" "r"))]
+ UNSPECV_REGIO_WRITE))]
+ ""
+ "mov\\t%0, %1"
+ [(set_attr "type" "alu")])
+
;; DI logical ops could be automatically split into WORD-mode ops in
;; expand_binop(). But then we'll miss an opportunity to use SI mode
;; operations, since WORD mode for PRU is QI.
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index f88877f..98364f0 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -802,7 +802,7 @@
rtx hp = gen_reg_rtx (<MODE>mode);
rtx lp = gen_reg_rtx (<MODE>mode);
- emit_insn (gen_mul<mode>3_highpart (hp, operands[1], operands[2]));
+ emit_insn (gen_smul<mode>3_highpart (hp, operands[1], operands[2]));
emit_insn (gen_mul<mode>3 (operands[0], operands[1], operands[2]));
emit_insn (gen_ashr<mode>3 (lp, operands[0],
GEN_INT (BITS_PER_WORD - 1)));
@@ -899,14 +899,14 @@
emit_insn (gen_muldi3 (low, operands[1], operands[2]));
rtx high = gen_reg_rtx (DImode);
- emit_insn (gen_<u>muldi3_highpart (high, operands[1], operands[2]));
+ emit_insn (gen_<su>muldi3_highpart (high, operands[1], operands[2]));
emit_move_insn (gen_lowpart (DImode, operands[0]), low);
emit_move_insn (gen_highpart (DImode, operands[0]), high);
DONE;
})
-(define_insn "<u>muldi3_highpart"
+(define_insn "<su>muldi3_highpart"
[(set (match_operand:DI 0 "register_operand" "=r")
(truncate:DI
(lshiftrt:TI
@@ -961,13 +961,13 @@
{
rtx temp = gen_reg_rtx (SImode);
emit_insn (gen_mulsi3 (temp, operands[1], operands[2]));
- emit_insn (gen_<u>mulsi3_highpart (riscv_subword (operands[0], true),
+ emit_insn (gen_<su>mulsi3_highpart (riscv_subword (operands[0], true),
operands[1], operands[2]));
emit_insn (gen_movsi (riscv_subword (operands[0], false), temp));
DONE;
})
-(define_insn "<u>mulsi3_highpart"
+(define_insn "<su>mulsi3_highpart"
[(set (match_operand:SI 0 "register_operand" "=r")
(truncate:SI
(lshiftrt:DI
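
The renamed <su>muldi3_highpart/<su>mulsi3_highpart patterns compute the upper
half of a widening signed or unsigned multiply (RISC-V mulh/mulhu). A C sketch
of the operation the DImode variants implement, assuming __int128 on RV64:

    /* Signed high part: truncate:DI (lshiftrt:TI (mult:TI ...) 64).  */
    long long
    smul_highpart (long long a, long long b)
    {
      return (long long) (((__int128) a * b) >> 64);
    }

    /* Unsigned high part, matching the <su> = u variant.  */
    unsigned long long
    umul_highpart (unsigned long long a, unsigned long long b)
    {
      return (unsigned long long) (((unsigned __int128) a * b) >> 64);
    }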
diff --git a/gcc/config/rs6000/darwin.h b/gcc/config/rs6000/darwin.h
index 6abf8e8..120b01f 100644
--- a/gcc/config/rs6000/darwin.h
+++ b/gcc/config/rs6000/darwin.h
@@ -203,7 +203,7 @@
/* Make both r2 and r13 available for allocation. */
#define FIXED_R2 0
-#define FIXED_R13 0
+#define FIXED_R13 TARGET_64BIT
/* Base register for access to local variables of the function. */
@@ -213,6 +213,9 @@
#undef RS6000_PIC_OFFSET_TABLE_REGNUM
#define RS6000_PIC_OFFSET_TABLE_REGNUM 31
+#undef FIRST_SAVED_GP_REGNO
+#define FIRST_SAVED_GP_REGNO 13
+
/* Darwin's stack must remain 16-byte aligned for both 32 and 64 bit
ABIs. */
diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c
index 7d48548..2eceb2c7 100644
--- a/gcc/config/rs6000/rs6000-call.c
+++ b/gcc/config/rs6000/rs6000-call.c
@@ -6223,11 +6223,19 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = {
or vector type. If a non-floating point or vector type is found, or
if a floating point or vector type that doesn't match a non-VOIDmode
*MODEP is found, then return -1, otherwise return the count in the
- sub-tree. */
+ sub-tree.
+
+ There have been some ABI snafus along the way with C++. Modify
+ EMPTY_BASE_SEEN to a nonzero value iff a C++ empty base class
+ makes an appearance; separate flag bits indicate whether such a
+ field is marked "no unique address". Modify ZERO_WIDTH_BF_SEEN
+ to 1 iff a C++ zero-width bit field makes an appearance; in that
+ case the type is otherwise still treated as a homogeneous
+ aggregate. */
static int
rs6000_aggregate_candidate (const_tree type, machine_mode *modep,
- int *empty_base_seen)
+ int *empty_base_seen, int *zero_width_bf_seen)
{
machine_mode mode;
HOST_WIDE_INT size;
@@ -6298,7 +6306,8 @@ rs6000_aggregate_candidate (const_tree type, machine_mode *modep,
return -1;
count = rs6000_aggregate_candidate (TREE_TYPE (type), modep,
- empty_base_seen);
+ empty_base_seen,
+ zero_width_bf_seen);
if (count == -1
|| !index
|| !TYPE_MAX_VALUE (index)
@@ -6336,6 +6345,26 @@ rs6000_aggregate_candidate (const_tree type, machine_mode *modep,
if (TREE_CODE (field) != FIELD_DECL)
continue;
+ if (DECL_FIELD_CXX_ZERO_WIDTH_BIT_FIELD (field))
+ {
+ /* GCC 11 and earlier generated incorrect code in a rare
+ corner case for C++. When a RECORD_TYPE looks like a
+ homogeneous aggregate, except that it also contains
+ one or more zero-width bit fields, these earlier
+ compilers would incorrectly pass the fields in FPRs
+ or VSRs. This occurred because the front end wrongly
+ removed these bitfields from the RECORD_TYPE. In
+ GCC 12 and later, the front end flaw was corrected.
+ We want to diagnose this case. To do this, we pretend
+ that we don't see the zero-width bit fields (hence
+ the continue statement here), but pass back a flag
+ indicating what happened. The caller then diagnoses
+ the issue and rejects the RECORD_TYPE as a homogeneous
+ aggregate. */
+ *zero_width_bf_seen = 1;
+ continue;
+ }
+
if (DECL_FIELD_ABI_IGNORED (field))
{
if (lookup_attribute ("no_unique_address",
@@ -6347,7 +6376,8 @@ rs6000_aggregate_candidate (const_tree type, machine_mode *modep,
}
sub_count = rs6000_aggregate_candidate (TREE_TYPE (field), modep,
- empty_base_seen);
+ empty_base_seen,
+ zero_width_bf_seen);
if (sub_count < 0)
return -1;
count += sub_count;
@@ -6381,7 +6411,8 @@ rs6000_aggregate_candidate (const_tree type, machine_mode *modep,
continue;
sub_count = rs6000_aggregate_candidate (TREE_TYPE (field), modep,
- empty_base_seen);
+ empty_base_seen,
+ zero_width_bf_seen);
if (sub_count < 0)
return -1;
count = count > sub_count ? count : sub_count;
@@ -6423,8 +6454,10 @@ rs6000_discover_homogeneous_aggregate (machine_mode mode, const_tree type,
{
machine_mode field_mode = VOIDmode;
int empty_base_seen = 0;
+ int zero_width_bf_seen = 0;
int field_count = rs6000_aggregate_candidate (type, &field_mode,
- &empty_base_seen);
+ &empty_base_seen,
+ &zero_width_bf_seen);
if (field_count > 0)
{
@@ -6460,6 +6493,25 @@ rs6000_discover_homogeneous_aggregate (machine_mode mode, const_tree type,
last_reported_type_uid = uid;
}
}
+ if (zero_width_bf_seen && warn_psabi)
+ {
+ static unsigned last_reported_type_uid;
+ unsigned uid = TYPE_UID (TYPE_MAIN_VARIANT (type));
+ if (uid != last_reported_type_uid)
+ {
+ inform (input_location,
+ "ELFv2 parameter passing for an argument that "
+ "contains zero-width bit fields but is "
+ "otherwise a homogeneous aggregate was "
+ "corrected in GCC 12");
+ last_reported_type_uid = uid;
+ }
+ if (elt_mode)
+ *elt_mode = mode;
+ if (n_elts)
+ *n_elts = 1;
+ return false;
+ }
return true;
}
}
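
A minimal example of the kind of record the new -Wpsabi note targets (a
sketch in shared C/C++ syntax; the underlying front-end change is
C++-specific, where GCC 11 and earlier dropped the field from the
RECORD_TYPE):

    /* Otherwise a homogeneous aggregate of two doubles; the zero-width
       bit field occupies no storage but now blocks treatment as a
       homogeneous aggregate, and with -Wpsabi draws the "corrected in
       GCC 12" note.  */
    struct fp_pair
    {
      double a;
      int : 0;                /* zero-width bit field */
      double b;
    };

    double
    sum (struct fp_pair p)    /* the parameter passing is what changed */
    {
      return p.a + p.b;
    }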
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 060f51a..ad86072 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -5289,9 +5289,6 @@ struct rs6000_cost_data
static void
rs6000_density_test (rs6000_cost_data *data)
{
- const int DENSITY_PCT_THRESHOLD = 85;
- const int DENSITY_SIZE_THRESHOLD = 70;
- const int DENSITY_PENALTY = 10;
struct loop *loop = data->loop_info;
basic_block *bbs = get_loop_body (loop);
int nbbs = loop->num_nodes;
@@ -5327,26 +5324,21 @@ rs6000_density_test (rs6000_cost_data *data)
free (bbs);
density_pct = (vec_cost * 100) / (vec_cost + not_vec_cost);
- if (density_pct > DENSITY_PCT_THRESHOLD
- && vec_cost + not_vec_cost > DENSITY_SIZE_THRESHOLD)
+ if (density_pct > rs6000_density_pct_threshold
+ && vec_cost + not_vec_cost > rs6000_density_size_threshold)
{
- data->cost[vect_body] = vec_cost * (100 + DENSITY_PENALTY) / 100;
+ data->cost[vect_body] = vec_cost * (100 + rs6000_density_penalty) / 100;
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"density %d%%, cost %d exceeds threshold, penalizing "
- "loop body cost by %d%%\n", density_pct,
- vec_cost + not_vec_cost, DENSITY_PENALTY);
+ "loop body cost by %u%%\n", density_pct,
+ vec_cost + not_vec_cost, rs6000_density_penalty);
}
/* Check whether we need to penalize the body cost to account
for excess strided or elementwise loads. */
if (data->extra_ctor_cost > 0)
{
- /* Threshold for load stmts percentage in all vectorized stmts. */
- const int DENSITY_LOAD_PCT_THRESHOLD = 45;
- /* Threshold for total number of load stmts. */
- const int DENSITY_LOAD_NUM_THRESHOLD = 20;
-
gcc_assert (data->nloads <= data->nstmts);
unsigned int load_pct = (data->nloads * 100) / data->nstmts;
@@ -5360,8 +5352,8 @@ rs6000_density_test (rs6000_cost_data *data)
the loads.
One typical case is the innermost loop of the hotspot of SPEC2017
503.bwaves_r without loop interchange. */
- if (data->nloads > DENSITY_LOAD_NUM_THRESHOLD
- && load_pct > DENSITY_LOAD_PCT_THRESHOLD)
+ if (data->nloads > (unsigned int) rs6000_density_load_num_threshold
+ && load_pct > (unsigned int) rs6000_density_load_pct_threshold)
{
data->cost[vect_body] += data->extra_ctor_cost;
if (dump_enabled_p ())
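
A worked instance of the first density heuristic with the new parameter
defaults (illustrative numbers only, sketched as standalone C):

    /* Mirrors the density_pct computation and threshold test above.  */
    unsigned int
    density_penalized_cost (unsigned int vec_cost, unsigned int not_vec_cost)
    {
      unsigned int density_pct
        = (vec_cost * 100) / (vec_cost + not_vec_cost);
      if (density_pct > 85              /* rs6000_density_pct_threshold  */
          && vec_cost + not_vec_cost > 70) /* rs6000_density_size_threshold  */
        return vec_cost * (100 + 10) / 100; /* rs6000_density_penalty  */
      return vec_cost;
    }

For vec_cost = 90 and not_vec_cost = 10, the density is 90% and the total
size is 100, so the body cost becomes 90 * 110 / 100 = 99.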
diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt
index c1cb9ab..9d7878f 100644
--- a/gcc/config/rs6000/rs6000.opt
+++ b/gcc/config/rs6000/rs6000.opt
@@ -639,3 +639,41 @@ Enable instructions that guard against return-oriented programming attacks.
mprivileged
Target Var(rs6000_privileged) Init(0)
Generate code that will run in privileged state.
+
+-param=rs6000-density-pct-threshold=
+Target Undocumented Joined UInteger Var(rs6000_density_pct_threshold) Init(85) IntegerRange(0, 100) Param
+When costing for loop vectorization, the loop body cost may need a penalty if
+the cost model does not adequately reflect delays from unavailable vector
+resources. The costs of vectorized and non-vectorized statements are collected
+separately; the penalty applies only if the proportion of vec_cost in the
+total cost (vec_cost plus non vec_cost) exceeds the threshold specified by
+this parameter. The default value is 85.
+
+-param=rs6000-density-size-threshold=
+Target Undocumented Joined UInteger Var(rs6000_density_size_threshold) Init(70) IntegerRange(0, 1000) Param
+Together with rs6000-density-pct-threshold, the total sum of vec_cost and non
+vec_cost is also checked; the penalty applies only if the sum exceeds the
+threshold specified by this parameter. The default value is 70.
+
+-param=rs6000-density-penalty=
+Target Undocumented Joined UInteger Var(rs6000_density_penalty) Init(10) IntegerRange(0, 1000) Param
+When both the rs6000-density-pct-threshold and
+rs6000-density-size-threshold heuristics are satisfied, the loop body cost
+is penalized by the percentage specified by this parameter. The default
+value is 10.
+
+-param=rs6000-density-load-pct-threshold=
+Target Undocumented Joined UInteger Var(rs6000_density_load_pct_threshold) Init(45) IntegerRange(0, 100) Param
+When costing for loop vectorization, the loop body cost may need a penalty to
+account for excess strided or elementwise loads. The numbers of general
+statements and of load statements among the statements to be vectorized are
+collected, and the proportion of load statements is checked; the penalty
+applies only if that proportion exceeds the threshold specified by this
+parameter. The default value is 45.
+
+-param=rs6000-density-load-num-threshold=
+Target Undocumented Joined UInteger Var(rs6000_density_load_num_threshold) Init(20) IntegerRange(0, 1000) Param
+Together with rs6000-density-load-pct-threshold, the total number of load
+statements is also checked; the penalty applies only if it exceeds the
+threshold specified by this parameter. The default value is 20.
+
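
These params are Undocumented and target-internal, but they remain reachable
from the command line for tuning experiments, e.g. (hypothetical values):

    gcc -O3 -mcpu=power10 -ftree-vectorize \
        --param=rs6000-density-pct-threshold=90 \
        --param=rs6000-density-penalty=20 test.c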
diff --git a/gcc/config/rs6000/vxworks.h b/gcc/config/rs6000/vxworks.h
index 5facbbb..d8ecc02 100644
--- a/gcc/config/rs6000/vxworks.h
+++ b/gcc/config/rs6000/vxworks.h
@@ -147,10 +147,6 @@ along with GCC; see the file COPYING3. If not see
#undef FUNCTION_PROFILER
#define FUNCTION_PROFILER(FILE,LABELNO) VXWORKS_FUNCTION_PROFILER(FILE,LABELNO)
-/* Initialize library function table. */
-#undef TARGET_INIT_LIBFUNCS
-#define TARGET_INIT_LIBFUNCS rs6000_vxworks_init_libfuncs
-
/* Nor sdata, for kernel mode. We use this in
SUBSUBTARGET_INITIALIZE_OPTIONS, after rs6000_rtp has been initialized. */
#undef SDATA_DEFAULT_SIZE
diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c
index 54dd633..e043854 100644
--- a/gcc/config/s390/s390.c
+++ b/gcc/config/s390/s390.c
@@ -6414,6 +6414,15 @@ s390_expand_insv (rtx dest, rtx op1, rtx op2, rtx src)
if (bitsize + bitpos > GET_MODE_BITSIZE (mode))
return false;
+ /* Just a move. */
+ if (bitpos == 0
+ && bitsize == GET_MODE_BITSIZE (GET_MODE (src))
+ && mode == GET_MODE (src))
+ {
+ emit_move_insn (dest, src);
+ return true;
+ }
+
/* Generate INSERT IMMEDIATE (IILL et al). */
/* (set (ze (reg)) (const_int)). */
if (TARGET_ZARCH
@@ -6510,6 +6519,7 @@ s390_expand_insv (rtx dest, rtx op1, rtx op2, rtx src)
&& (bitpos & 32) == ((bitpos + bitsize - 1) & 32)
&& MEM_P (src)
&& (mode == DImode || mode == SImode)
+ && mode != smode
&& register_operand (dest, mode))
{
/* Emit a strict_low_part pattern if possible. */
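
The new "just a move" early exit covers a full-width insertion at bit position
zero, which needs no INSERT-style instruction at all. An illustrative 32-bit
analogue in C (LSB-based bit numbering, unlike s390's zero_extract):

    /* Insert the low BITSIZE bits of SRC into DEST at BITPOS.  When
       BITPOS == 0 and BITSIZE == 32 the mask covers every bit, and the
       whole expression collapses to SRC -- i.e. a plain move.  */
    unsigned int
    insv32 (unsigned int dest, unsigned int src,
            unsigned int bitpos, unsigned int bitsize)
    {
      unsigned long long field = (1ull << bitsize) - 1;
      unsigned int mask = (unsigned int) (field << bitpos);
      return (dest & ~mask) | ((src << bitpos) & mask);
    }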
diff --git a/gcc/config/s390/tpf.md b/gcc/config/s390/tpf.md
index 297e9d1..35b3719 100644
--- a/gcc/config/s390/tpf.md
+++ b/gcc/config/s390/tpf.md
@@ -21,7 +21,8 @@
[(unspec_volatile [(match_operand 0 "const_int_operand" "J")
(match_operand 1 "const_int_operand" "J")]
UNSPECV_TPF_PROLOGUE)
- (clobber (reg:DI 1))]
+ (clobber (reg:DI 1))
+ (clobber (reg:CC CC_REGNUM))]
"TARGET_TPF_PROFILING"
"larl\t%%r1,.+14\;tm\t%0,255\;bnz\t%1"
[(set_attr "length" "14")])
@@ -31,7 +32,8 @@
[(unspec_volatile [(match_operand 0 "const_int_operand" "J")
(match_operand 1 "const_int_operand" "J")]
UNSPECV_TPF_EPILOGUE)
- (clobber (reg:DI 1))]
+ (clobber (reg:DI 1))
+ (clobber (reg:CC CC_REGNUM))]
"TARGET_TPF_PROFILING"
"larl\t%%r1,.+14\;tm\t%0,255\;bnz\t%1"
[(set_attr "length" "14")])