| | | |
|---|---|---|
| author | Ian Lance Taylor <iant@golang.org> | 2021-10-07 15:28:36 -0700 |
| committer | Ian Lance Taylor <iant@golang.org> | 2021-10-07 15:28:36 -0700 |
| commit | 0b6b70a0733672600644c8df96942cda5bf86d3d (patch) | |
| tree | 9a1fbd7f782c54df55ab225ed1be057e3f3b0b8a /gcc/config | |
| parent | a5b5cabc91c38710adbe5c8a2b53882abe994441 (diff) | |
| parent | fba228e259dd5112851527f2dbb62c5601100985 (diff) | |
Merge from trunk revision fba228e259dd5112851527f2dbb62c5601100985.
Diffstat (limited to 'gcc/config')
46 files changed, 3225 insertions, 314 deletions
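Among the changes merged below, the arm_neon.h hunks rewire the `vqtbl1`/`vqtbx1` table-lookup intrinsics onto new poly- and signed-typed builtins (for example `__builtin_aarch64_qtbl1v8qi_ppu`) instead of casting every operand through the `int8x16_t` variant. The user-visible intrinsic signatures are unchanged; the following sketch, which assumes an AArch64 compiler providing `<arm_neon.h>`, shows the intrinsics whose implementation the diff touches:

```c
/* Usage sketch for the vqtbl1 intrinsics whose implementation changes
   below; the intrinsic signatures themselves are unchanged.  Assumes an
   AArch64 compiler providing <arm_neon.h>.  */
#include <arm_neon.h>

/* Reverse the low 8 bytes of a 16-byte table.  */
uint8x8_t
reverse_low8 (uint8x16_t table)
{
  static const uint8_t idx[8] = { 7, 6, 5, 4, 3, 2, 1, 0 };
  return vqtbl1_u8 (table, vld1_u8 (idx));
}

/* Polynomial variant: after this merge it expands through the typed
   __builtin_aarch64_qtbl1v8qi_ppu builtin rather than int8x16_t casts.  */
poly8x8_t
lookup_p8 (poly8x16_t table, uint8x8_t idx)
{
  return vqtbl1_p8 (table, idx);
}
```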
diff --git a/gcc/config/aarch64/aarch64-arches.def b/gcc/config/aarch64/aarch64-arches.def index b749727..a3b32e0 100644 --- a/gcc/config/aarch64/aarch64-arches.def +++ b/gcc/config/aarch64/aarch64-arches.def @@ -37,6 +37,8 @@ AARCH64_ARCH("armv8.3-a", generic, 8_3A, 8, AARCH64_FL_FOR_ARCH8_3) AARCH64_ARCH("armv8.4-a", generic, 8_4A, 8, AARCH64_FL_FOR_ARCH8_4) AARCH64_ARCH("armv8.5-a", generic, 8_5A, 8, AARCH64_FL_FOR_ARCH8_5) AARCH64_ARCH("armv8.6-a", generic, 8_6A, 8, AARCH64_FL_FOR_ARCH8_6) +AARCH64_ARCH("armv8.7-a", generic, 8_7A, 8, AARCH64_FL_FOR_ARCH8_7) AARCH64_ARCH("armv8-r", generic, 8R , 8, AARCH64_FL_FOR_ARCH8_R) +AARCH64_ARCH("armv9-a", generic, 9A , 9, AARCH64_FL_FOR_ARCH9) #undef AARCH64_ARCH diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c index 119f67d..1a507ea 100644 --- a/gcc/config/aarch64/aarch64-builtins.c +++ b/gcc/config/aarch64/aarch64-builtins.c @@ -182,6 +182,10 @@ static enum aarch64_type_qualifiers aarch64_types_binopp_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_poly, qualifier_poly, qualifier_poly }; #define TYPES_BINOPP (aarch64_types_binopp_qualifiers) +static enum aarch64_type_qualifiers +aarch64_types_binop_ppu_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_poly, qualifier_poly, qualifier_unsigned }; +#define TYPES_BINOP_PPU (aarch64_types_binop_ppu_qualifiers) static enum aarch64_type_qualifiers aarch64_types_ternop_qualifiers[SIMD_MAX_BUILTIN_ARGS] @@ -207,6 +211,10 @@ aarch64_types_ternopu_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS] qualifier_unsigned, qualifier_immediate }; #define TYPES_TERNOPUI (aarch64_types_ternopu_imm_qualifiers) static enum aarch64_type_qualifiers +aarch64_types_ternop_sssu_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_none, qualifier_none, qualifier_none, qualifier_unsigned }; +#define TYPES_TERNOP_SSSU (aarch64_types_ternop_sssu_qualifiers) +static enum aarch64_type_qualifiers aarch64_types_ternop_ssus_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_none, qualifier_none, qualifier_unsigned, qualifier_none }; #define TYPES_TERNOP_SSUS (aarch64_types_ternop_ssus_qualifiers) @@ -214,6 +222,10 @@ static enum aarch64_type_qualifiers aarch64_types_ternop_suss_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_none, qualifier_unsigned, qualifier_none, qualifier_none }; #define TYPES_TERNOP_SUSS (aarch64_types_ternop_suss_qualifiers) +static enum aarch64_type_qualifiers +aarch64_types_binop_pppu_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_poly, qualifier_poly, qualifier_poly, qualifier_unsigned }; +#define TYPES_TERNOP_PPPU (aarch64_types_binop_pppu_qualifiers) static enum aarch64_type_qualifiers diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def index b2aa167..77da310 100644 --- a/gcc/config/aarch64/aarch64-cores.def +++ b/gcc/config/aarch64/aarch64-cores.def @@ -162,4 +162,13 @@ AARCH64_CORE("cortex-a76.cortex-a55", cortexa76cortexa55, cortexa53, 8_2A, AAR /* Armv8-R Architecture Processors. */ AARCH64_CORE("cortex-r82", cortexr82, cortexa53, 8R, AARCH64_FL_FOR_ARCH8_R, cortexa53, 0x41, 0xd15, -1) +/* Armv9.0-A Architecture Processors. */ + +/* Arm ('A') cores. 
*/ +AARCH64_CORE("cortex-a510", cortexa510, cortexa55, 9A, AARCH64_FL_FOR_ARCH9 | AARCH64_FL_SVE2_BITPERM | AARCH64_FL_MEMTAG | AARCH64_FL_I8MM | AARCH64_FL_BF16, cortexa53, 0x41, 0xd46, -1) + +AARCH64_CORE("cortex-a710", cortexa710, cortexa57, 9A, AARCH64_FL_FOR_ARCH9 | AARCH64_FL_SVE2_BITPERM | AARCH64_FL_MEMTAG | AARCH64_FL_I8MM | AARCH64_FL_BF16, neoversen2, 0x41, 0xd47, -1) + +AARCH64_CORE("cortex-x2", cortexx2, cortexa57, 9A, AARCH64_FL_FOR_ARCH9 | AARCH64_FL_SVE2_BITPERM | AARCH64_FL_MEMTAG | AARCH64_FL_I8MM | AARCH64_FL_BF16, neoversen2, 0x41, 0xd48, -1) + #undef AARCH64_CORE diff --git a/gcc/config/aarch64/aarch64-option-extensions.def b/gcc/config/aarch64/aarch64-option-extensions.def index 579328c..b61f1df 100644 --- a/gcc/config/aarch64/aarch64-option-extensions.def +++ b/gcc/config/aarch64/aarch64-option-extensions.def @@ -232,4 +232,7 @@ AARCH64_OPT_EXTENSION("flagm", AARCH64_FL_FLAGM, 0, 0, false, "flagm") /* Enabling/Disabling "pauth" only changes "pauth". */ AARCH64_OPT_EXTENSION("pauth", AARCH64_FL_PAUTH, 0, 0, false, "paca pacg") +/* Enabling/Disabling "ls64" only changes "ls64". */ +AARCH64_OPT_EXTENSION("ls64", AARCH64_FL_LS64, 0, 0, false, "") + #undef AARCH64_OPT_EXTENSION diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index 402453a..35dc075 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -721,6 +721,8 @@ /* Implemented by aarch64_qtbl1<mode>. */ VAR2 (BINOP, qtbl1, 0, NONE, v8qi, v16qi) VAR2 (BINOPU, qtbl1, 0, NONE, v8qi, v16qi) + VAR2 (BINOP_PPU, qtbl1, 0, NONE, v8qi, v16qi) + VAR2 (BINOP_SSU, qtbl1, 0, NONE, v8qi, v16qi) /* Implemented by aarch64_qtbl2<mode>. */ VAR2 (BINOP, qtbl2, 0, NONE, v8qi, v16qi) @@ -734,6 +736,8 @@ /* Implemented by aarch64_qtbx1<mode>. */ VAR2 (TERNOP, qtbx1, 0, NONE, v8qi, v16qi) VAR2 (TERNOPU, qtbx1, 0, NONE, v8qi, v16qi) + VAR2 (TERNOP_PPPU, qtbx1, 0, NONE, v8qi, v16qi) + VAR2 (TERNOP_SSSU, qtbx1, 0, NONE, v8qi, v16qi) /* Implemented by aarch64_qtbx2<mode>. 
*/ VAR2 (TERNOP, qtbx2, 0, NONE, v8qi, v16qi) diff --git a/gcc/config/aarch64/aarch64-tune.md b/gcc/config/aarch64/aarch64-tune.md index e491c29..12be913 100644 --- a/gcc/config/aarch64/aarch64-tune.md +++ b/gcc/config/aarch64/aarch64-tune.md @@ -1,5 +1,5 @@ ;; -*- buffer-read-only: t -*- ;; Generated automatically by gentune.sh from aarch64-cores.def (define_attr "tune" - "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,zeus,neoversev1,neoverse512tvb,saphira,neoversen2,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82" + "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,zeus,neoversev1,neoverse512tvb,saphira,neoversen2,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa710,cortexx2" (const (symbol_ref "((enum attr_tune) aarch64_tune)"))) diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 36519cc..a9a1800 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -23390,7 +23390,8 @@ aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst, } /* Expand cpymem, as if from a __builtin_memcpy. Return true if - we succeed, otherwise return false. */ + we succeed, otherwise return false, indicating that a libcall to + memcpy should be emitted. */ bool aarch64_expand_cpymem (rtx *operands) @@ -23407,11 +23408,13 @@ aarch64_expand_cpymem (rtx *operands) unsigned HOST_WIDE_INT size = INTVAL (operands[2]); - /* Inline up to 256 bytes when optimizing for speed. */ + /* Try to inline up to 256 bytes. */ unsigned HOST_WIDE_INT max_copy_size = 256; - if (optimize_function_for_size_p (cfun)) - max_copy_size = 128; + bool size_p = optimize_function_for_size_p (cfun); + + if (size > max_copy_size) + return false; int copy_bits = 256; @@ -23421,13 +23424,14 @@ aarch64_expand_cpymem (rtx *operands) || !TARGET_SIMD || (aarch64_tune_params.extra_tuning_flags & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)) - { - copy_bits = 128; - max_copy_size = max_copy_size / 2; - } + copy_bits = 128; - if (size > max_copy_size) - return false; + /* Emit an inline load+store sequence and count the number of operations + involved. We use a simple count of just the loads and stores emitted + rather than rtx_insn count as all the pointer adjustments and reg copying + in this function will get optimized away later in the pipeline. 
*/ + start_sequence (); + unsigned nops = 0; base = copy_to_mode_reg (Pmode, XEXP (dst, 0)); dst = adjust_automodify_address (dst, VOIDmode, base, 0); @@ -23456,7 +23460,8 @@ aarch64_expand_cpymem (rtx *operands) cur_mode = V4SImode; aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode); - + /* A single block copy is 1 load + 1 store. */ + nops += 2; n -= mode_bits; /* Emit trailing copies using overlapping unaligned accesses - this is @@ -23471,7 +23476,16 @@ aarch64_expand_cpymem (rtx *operands) n = n_bits; } } + rtx_insn *seq = get_insns (); + end_sequence (); + + /* A memcpy libcall in the worst case takes 3 instructions to prepare the + arguments + 1 for the call. */ + unsigned libcall_cost = 4; + if (size_p && libcall_cost < nops) + return false; + emit_insn (seq); return true; } @@ -23534,40 +23548,37 @@ aarch64_expand_setmem (rtx *operands) if (!CONST_INT_P (operands[1])) return false; - bool speed_p = !optimize_function_for_size_p (cfun); + bool size_p = optimize_function_for_size_p (cfun); /* Default the maximum to 256-bytes. */ unsigned max_set_size = 256; - /* In case we are optimizing for size or if the core does not - want to use STP Q regs, lower the max_set_size. */ - max_set_size = (!speed_p - || (aarch64_tune_params.extra_tuning_flags - & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)) - ? max_set_size / 2 : max_set_size; - len = INTVAL (operands[1]); /* Upper bound check. */ if (len > max_set_size) return false; + /* Attempt a sequence with a vector broadcast followed by stores. + Count the number of operations involved to see if it's worth it for + code size. */ + start_sequence (); + unsigned nops = 0; base = copy_to_mode_reg (Pmode, XEXP (dst, 0)); dst = adjust_automodify_address (dst, VOIDmode, base, 0); /* Prepare the val using a DUP/MOVI v0.16B, val. */ src = expand_vector_broadcast (V16QImode, val); src = force_reg (V16QImode, src); - + nops++; /* Convert len to bits to make the rest of the code simpler. */ n = len * BITS_PER_UNIT; /* Maximum amount to copy in one go. We allow 256-bit chunks based on the AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter. setmem expand pattern is only turned on for TARGET_SIMD. */ - const int copy_limit = (speed_p - && (aarch64_tune_params.extra_tuning_flags - & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)) + const int copy_limit = (aarch64_tune_params.extra_tuning_flags + & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) ? GET_MODE_BITSIZE (TImode) : 256; while (n > 0) @@ -23583,7 +23594,7 @@ aarch64_expand_setmem (rtx *operands) mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant (); aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode); - + nops++; n -= mode_bits; /* Do certain trailing copies as overlapping if it's going to be @@ -23599,7 +23610,15 @@ aarch64_expand_setmem (rtx *operands) n = n_bits; } } + rtx_insn *seq = get_insns (); + end_sequence (); + /* A call to memset in the worst case requires 3 instructions to prepare + the arguments + 1 for the call. Prefer the inline sequence for size if + it is no longer than that. */ + if (size_p && nops > 4) + return false; + emit_insn (seq); return true; } diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h index a5ba6c2..2792bb2 100644 --- a/gcc/config/aarch64/aarch64.h +++ b/gcc/config/aarch64/aarch64.h @@ -231,6 +231,15 @@ extern unsigned aarch64_architecture_version; /* Pointer Authentication (PAUTH) extension. */ #define AARCH64_FL_PAUTH (1ULL << 40) +/* 64-byte atomic load/store extensions. 
*/ +#define AARCH64_FL_LS64 (1ULL << 41) + +/* Armv8.7-a architecture extensions. */ +#define AARCH64_FL_V8_7 (1ULL << 42) + +/* Armv9.0-A. */ +#define AARCH64_FL_V9 (1ULL << 43) /* Armv9.0-A Architecture. */ + /* Has FP and SIMD. */ #define AARCH64_FL_FPSIMD (AARCH64_FL_FP | AARCH64_FL_SIMD) @@ -255,8 +264,13 @@ extern unsigned aarch64_architecture_version; #define AARCH64_FL_FOR_ARCH8_6 \ (AARCH64_FL_FOR_ARCH8_5 | AARCH64_FL_V8_6 | AARCH64_FL_FPSIMD \ | AARCH64_FL_I8MM | AARCH64_FL_BF16) +#define AARCH64_FL_FOR_ARCH8_7 \ + (AARCH64_FL_FOR_ARCH8_6 | AARCH64_FL_V8_7 | AARCH64_FL_LS64) + #define AARCH64_FL_FOR_ARCH8_R \ (AARCH64_FL_FOR_ARCH8_4 | AARCH64_FL_V8_R) +#define AARCH64_FL_FOR_ARCH9 \ + (AARCH64_FL_FOR_ARCH8_5 | AARCH64_FL_SVE | AARCH64_FL_SVE2 | AARCH64_FL_V9) /* Macros to test ISA flags. */ @@ -295,6 +309,7 @@ extern unsigned aarch64_architecture_version; #define AARCH64_ISA_SB (aarch64_isa_flags & AARCH64_FL_SB) #define AARCH64_ISA_V8_R (aarch64_isa_flags & AARCH64_FL_V8_R) #define AARCH64_ISA_PAUTH (aarch64_isa_flags & AARCH64_FL_PAUTH) +#define AARCH64_ISA_V9 (aarch64_isa_flags & AARCH64_FL_V9) /* Crypto is an optional extension to AdvSIMD. */ #define TARGET_CRYPTO (TARGET_SIMD && AARCH64_ISA_CRYPTO) diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 635a223..2d5bf34 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -10416,15 +10416,14 @@ __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl1_p8 (poly8x16_t __tab, uint8x8_t __idx) { - return (poly8x8_t) __builtin_aarch64_qtbl1v8qi ((int8x16_t) __tab, - (int8x8_t) __idx); + return __builtin_aarch64_qtbl1v8qi_ppu (__tab, __idx); } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl1_s8 (int8x16_t __tab, uint8x8_t __idx) { - return __builtin_aarch64_qtbl1v8qi (__tab, (int8x8_t) __idx); + return __builtin_aarch64_qtbl1v8qi_ssu (__tab, __idx); } __extension__ extern __inline uint8x8_t @@ -10438,15 +10437,14 @@ __extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl1q_p8 (poly8x16_t __tab, uint8x16_t __idx) { - return (poly8x16_t) __builtin_aarch64_qtbl1v16qi ((int8x16_t) __tab, - (int8x16_t) __idx); + return __builtin_aarch64_qtbl1v16qi_ppu (__tab, __idx); } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl1q_s8 (int8x16_t __tab, uint8x16_t __idx) { - return __builtin_aarch64_qtbl1v16qi (__tab, (int8x16_t) __idx); + return __builtin_aarch64_qtbl1v16qi_ssu (__tab, __idx); } __extension__ extern __inline uint8x16_t @@ -10460,7 +10458,7 @@ __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx1_s8 (int8x8_t __r, int8x16_t __tab, uint8x8_t __idx) { - return __builtin_aarch64_qtbx1v8qi (__r, __tab, (int8x8_t) __idx); + return __builtin_aarch64_qtbx1v8qi_sssu (__r, __tab, __idx); } __extension__ extern __inline uint8x8_t @@ -10474,16 +10472,14 @@ __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx1_p8 (poly8x8_t __r, poly8x16_t __tab, uint8x8_t __idx) { - return (poly8x8_t) __builtin_aarch64_qtbx1v8qi ((int8x8_t) __r, - (int8x16_t) __tab, - (int8x8_t) __idx); + return __builtin_aarch64_qtbx1v8qi_pppu (__r, __tab, __idx); } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) vqtbx1q_s8 (int8x16_t __r, int8x16_t __tab, uint8x16_t __idx) { - return __builtin_aarch64_qtbx1v16qi (__r, __tab, (int8x16_t) __idx); + return __builtin_aarch64_qtbx1v16qi_sssu (__r, __tab, __idx); } __extension__ extern __inline uint8x16_t @@ -10497,9 +10493,7 @@ __extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx1q_p8 (poly8x16_t __r, poly8x16_t __tab, uint8x16_t __idx) { - return (poly8x16_t) __builtin_aarch64_qtbx1v16qi ((int8x16_t) __r, - (int8x16_t) __tab, - (int8x16_t) __idx); + return __builtin_aarch64_qtbx1v16qi_pppu (__r, __tab, __idx); } /* V7 legacy table intrinsics. */ @@ -10528,8 +10522,7 @@ vtbl1_p8 (poly8x8_t __tab, uint8x8_t __idx) { poly8x16_t __temp = vcombine_p8 (__tab, vcreate_p8 (__AARCH64_UINT64_C (0x0))); - return (poly8x8_t) __builtin_aarch64_qtbl1v8qi ((int8x16_t) __temp, - (int8x8_t) __idx); + return __builtin_aarch64_qtbl1v8qi_ppu (__temp, __idx); } __extension__ extern __inline int8x8_t @@ -10553,8 +10546,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbl2_p8 (poly8x8x2_t __tab, uint8x8_t __idx) { poly8x16_t __temp = vcombine_p8 (__tab.val[0], __tab.val[1]); - return (poly8x8_t) __builtin_aarch64_qtbl1v8qi ((int8x16_t) __temp, - (int8x8_t) __idx); + return __builtin_aarch64_qtbl1v8qi_ppu (__temp, __idx); } __extension__ extern __inline int8x8_t @@ -10653,9 +10645,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbx2_p8 (poly8x8_t __r, poly8x8x2_t __tab, uint8x8_t __idx) { poly8x16_t __temp = vcombine_p8 (__tab.val[0], __tab.val[1]); - return (poly8x8_t) __builtin_aarch64_qtbx1v8qi ((int8x8_t) __r, - (int8x16_t) __temp, - (int8x8_t) __idx); + return __builtin_aarch64_qtbx1v8qi_pppu (__r, __temp, __idx); } /* End of temporary inline asm. 
*/ diff --git a/gcc/config/arm/arm-cpus.in b/gcc/config/arm/arm-cpus.in index bcc9ebe..d0d0d0f 100644 --- a/gcc/config/arm/arm-cpus.in +++ b/gcc/config/arm/arm-cpus.in @@ -1612,6 +1612,16 @@ begin cpu cortex-r52 part d13 end cpu cortex-r52 +begin cpu cortex-r52plus + cname cortexr52plus + tune flags LDSCHED + architecture armv8-r+crc+simd + option nofp.dp remove FP_DBL ALL_SIMD + costs cortex + vendor 41 + part d16 +end cpu cortex-r52plus + # FPU entries # format: # begin fpu <name> diff --git a/gcc/config/arm/arm-tables.opt b/gcc/config/arm/arm-tables.opt index 5692d4f..8bb0c9f 100644 --- a/gcc/config/arm/arm-tables.opt +++ b/gcc/config/arm/arm-tables.opt @@ -282,6 +282,9 @@ Enum(processor_type) String(cortex-m55) Value( TARGET_CPU_cortexm55) EnumValue Enum(processor_type) String(cortex-r52) Value( TARGET_CPU_cortexr52) +EnumValue +Enum(processor_type) String(cortex-r52plus) Value( TARGET_CPU_cortexr52plus) + Enum Name(arm_arch) Type(int) Known ARM architectures (for use with the -march= option): diff --git a/gcc/config/arm/arm-tune.md b/gcc/config/arm/arm-tune.md index b9df864..6482833 100644 --- a/gcc/config/arm/arm-tune.md +++ b/gcc/config/arm/arm-tune.md @@ -49,5 +49,5 @@ cortexx1,neoversen1,cortexa75cortexa55, cortexa76cortexa55,neoversev1,neoversen2, cortexm23,cortexm33,cortexm35p, - cortexm55,cortexr52" + cortexm55,cortexr52,cortexr52plus" (const (symbol_ref "((enum attr_tune) arm_tune)"))) diff --git a/gcc/config/darwin.h b/gcc/config/darwin.h index 50524a5..0fa1c57 100644 --- a/gcc/config/darwin.h +++ b/gcc/config/darwin.h @@ -251,7 +251,7 @@ extern GTY(()) int darwin_ms_struct; %{v} \ %{g*:%{!gctf:%{!gbtf:%{!gstabs*:%{%:debug-level-gt(0): -idsym}}}}}\ %{.c|.cc|.C|.cpp|.cp|.c++|.cxx|.CPP|.m|.mm|.s|.f|.f90|\ - .f95|.f03|.f77|.for|.F|.F90|.F95|.F03: \ + .f95|.f03|.f77|.for|.F|.F90|.F95|.F03|.d: \ %{g*:%{!gctf:%{!gbtf:%{!gstabs*:%{%:debug-level-gt(0): -dsym}}}}}}}}}}}}}" #define LINK_COMMAND_SPEC LINK_COMMAND_SPEC_A DSYMUTIL_SPEC diff --git a/gcc/config/gcn/gcn-hsa.h b/gcc/config/gcn/gcn-hsa.h index fc99c8d..6a432d1 100644 --- a/gcc/config/gcn/gcn-hsa.h +++ b/gcc/config/gcn/gcn-hsa.h @@ -75,25 +75,66 @@ extern unsigned int gcn_local_sym_hash (const char *name); supported for gcn. */ #define GOMP_SELF_SPECS "" +#ifdef HAVE_GCN_XNACK_FIJI +#define X_FIJI +#else +#define X_FIJI "!march=*:;march=fiji:;" +#endif +#ifdef HAVE_GCN_XNACK_GFX900 +#define X_900 +#else +#define X_900 "march=gfx900:;" +#endif +#ifdef HAVE_GCN_XNACK_GFX906 +#define X_906 +#else +#define X_906 "march=gfx906:;" +#endif +#ifdef HAVE_GCN_XNACK_GFX908 +#define X_908 +#else +#define X_908 "march=gfx908:;" +#endif + #ifdef HAVE_GCN_SRAM_ECC_FIJI -#define A_FIJI +#define S_FIJI #else -#define A_FIJI "!march=*:;march=fiji:;" +#define S_FIJI "!march=*:;march=fiji:;" #endif #ifdef HAVE_GCN_SRAM_ECC_GFX900 -#define A_900 +#define S_900 #else -#define A_900 "march=gfx900:;" +#define S_900 "march=gfx900:;" #endif #ifdef HAVE_GCN_SRAM_ECC_GFX906 -#define A_906 +#define S_906 #else -#define A_906 "march=gfx906:;" +#define S_906 "march=gfx906:;" #endif #ifdef HAVE_GCN_SRAM_ECC_GFX908 -#define A_908 +#define S_908 +#else +#define S_908 "march=gfx908:;" +#endif + +#ifdef HAVE_GCN_ASM_V3_SYNTAX +#define SRAMOPT "!msram-ecc=off:-mattr=+sram-ecc;:-mattr=-sram-ecc" +#endif +#ifdef HAVE_GCN_ASM_V4_SYNTAX +/* In HSACOv4 no attribute setting means the binary supports "any" hardware + configuration. The name of the attribute also changed. 
*/ +#define SRAMOPT "msram-ecc=on:-mattr=+sramecc;msram-ecc=off:-mattr=-sramecc" +#endif +#if !defined(SRAMOPT) && !defined(IN_LIBGCC2) +#error "No assembler syntax configured" +#endif + +#ifdef HAVE_GCN_ASM_V4_SYNTAX +/* FIJI cards don't seem to support drivers new enough to allow HSACOv4. */ +#define HSACO3_SELECT_OPT \ + "%{!march=*|march=fiji:--amdhsa-code-object-version=3} " #else -#define A_908 "march=gfx908:;" +#define HSACO3_SELECT_OPT #endif /* These targets can't have SRAM-ECC, even if a broken assembler allows it. */ @@ -103,10 +144,10 @@ extern unsigned int gcn_local_sym_hash (const char *name); /* Use LLVM assembler and linker options. */ #define ASM_SPEC "-triple=amdgcn--amdhsa " \ "%:last_arg(%{march=*:-mcpu=%*}) " \ - "-mattr=%{mxnack:+xnack;:-xnack} " \ - /* FIXME: support "any" when we move to HSACOv4. */ \ - "-mattr=%{" A_FIJI A_900 A_906 A_908 \ - "!msram-ecc=off:+sram-ecc;:-sram-ecc} " \ + HSACO3_SELECT_OPT \ + "%{" X_FIJI X_900 X_906 X_908 \ + "mxnack:-mattr=+xnack;:-mattr=-xnack} " \ + "%{" S_FIJI S_900 S_906 S_908 SRAMOPT "} " \ "-filetype=obj" #define LINK_SPEC "--pie --export-dynamic" #define LIB_SPEC "-lc" diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md index 84ff675..01fdce6 100644 --- a/gcc/config/gcn/gcn-valu.md +++ b/gcc/config/gcn/gcn-valu.md @@ -827,8 +827,12 @@ /* Work around assembler bug in which a 64-bit register is expected, but a 32-bit value would be correct. */ int reg = REGNO (operands[2]) - FIRST_VGPR_REG; - sprintf (buf, "global_load%%o0\t%%0, v[%d:%d], %%1 offset:%%3%s\;" - "s_waitcnt\tvmcnt(0)", reg, reg + 1, glc); + if (HAVE_GCN_ASM_GLOBAL_LOAD_FIXED) + sprintf (buf, "global_load%%o0\t%%0, v%d, %%1 offset:%%3%s\;" + "s_waitcnt\tvmcnt(0)", reg, glc); + else + sprintf (buf, "global_load%%o0\t%%0, v[%d:%d], %%1 offset:%%3%s\;" + "s_waitcnt\tvmcnt(0)", reg, reg + 1, glc); } else gcc_unreachable (); @@ -958,8 +962,12 @@ /* Work around assembler bug in which a 64-bit register is expected, but a 32-bit value would be correct. 
*/ int reg = REGNO (operands[1]) - FIRST_VGPR_REG; - sprintf (buf, "global_store%%s3\tv[%d:%d], %%3, %%0 offset:%%2%s", - reg, reg + 1, glc); + if (HAVE_GCN_ASM_GLOBAL_LOAD_FIXED) + sprintf (buf, "global_store%%s3\tv%d, %%3, %%0 offset:%%2%s", + reg, glc); + else + sprintf (buf, "global_store%%s3\tv[%d:%d], %%3, %%0 offset:%%2%s", + reg, reg + 1, glc); } else gcc_unreachable (); diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c index 2a3fc96..2e90f32 100644 --- a/gcc/config/gcn/gcn.c +++ b/gcc/config/gcn/gcn.c @@ -5217,42 +5217,76 @@ static void output_file_start (void) { const char *cpu; - bool use_sram = flag_sram_ecc; + bool use_xnack_attr = true; + bool use_sram_attr = true; switch (gcn_arch) { case PROCESSOR_FIJI: cpu = "gfx803"; +#ifndef HAVE_GCN_XNACK_FIJI + use_xnack_attr = false; +#endif #ifndef HAVE_GCN_SRAM_ECC_FIJI - use_sram = false; + use_sram_attr = false; #endif break; case PROCESSOR_VEGA10: cpu = "gfx900"; +#ifndef HAVE_GCN_XNACK_GFX900 + use_xnack_attr = false; +#endif #ifndef HAVE_GCN_SRAM_ECC_GFX900 - use_sram = false; + use_sram_attr = false; #endif break; case PROCESSOR_VEGA20: cpu = "gfx906"; +#ifndef HAVE_GCN_XNACK_GFX906 + use_xnack_attr = false; +#endif #ifndef HAVE_GCN_SRAM_ECC_GFX906 - use_sram = false; + use_sram_attr = false; #endif break; case PROCESSOR_GFX908: cpu = "gfx908"; +#ifndef HAVE_GCN_XNACK_GFX908 + use_xnack_attr = false; +#endif #ifndef HAVE_GCN_SRAM_ECC_GFX908 - use_sram = false; + use_sram_attr = false; #endif break; default: gcc_unreachable (); } +#if HAVE_GCN_ASM_V3_SYNTAX const char *xnack = (flag_xnack ? "+xnack" : ""); - /* FIXME: support "any" when we move to HSACOv4. */ - const char *sram_ecc = (use_sram ? "+sram-ecc" : ""); + const char *sram_ecc = (flag_sram_ecc ? "+sram-ecc" : ""); +#endif +#if HAVE_GCN_ASM_V4_SYNTAX + /* In HSACOv4 no attribute setting means the binary supports "any" hardware + configuration. In GCC binaries, this is true for SRAM ECC, but not + XNACK. */ + const char *xnack = (flag_xnack ? ":xnack+" : ":xnack-"); + const char *sram_ecc = (flag_sram_ecc == SRAM_ECC_ON ? ":sramecc+" + : flag_sram_ecc == SRAM_ECC_OFF ? ":sramecc-" + : ""); +#endif + if (!use_xnack_attr) + xnack = ""; + if (!use_sram_attr) + sram_ecc = ""; fprintf(asm_out_file, "\t.amdgcn_target \"amdgcn-unknown-amdhsa--%s%s%s\"\n", - cpu, xnack, sram_ecc); + cpu, +#if HAVE_GCN_ASM_V3_SYNTAX + xnack, sram_ecc +#endif +#ifdef HAVE_GCN_ASM_V4_SYNTAX + sram_ecc, xnack +#endif + ); } /* Implement ASM_DECLARE_FUNCTION_NAME via gcn-hsa.h. diff --git a/gcc/config/gcn/mkoffload.c b/gcc/config/gcn/mkoffload.c index 732bdfd..a3b22d0 100644 --- a/gcc/config/gcn/mkoffload.c +++ b/gcc/config/gcn/mkoffload.c @@ -54,8 +54,51 @@ #undef EF_AMDGPU_MACH_AMDGCN_GFX908 #define EF_AMDGPU_MACH_AMDGCN_GFX908 0x30 -#define EF_AMDGPU_XNACK 0x100 -#define EF_AMDGPU_SRAM_ECC 0x200 +#define EF_AMDGPU_XNACK_V3 0x100 +#define EF_AMDGPU_SRAM_ECC_V3 0x200 + +#define EF_AMDGPU_FEATURE_XNACK_V4 0x300 /* Mask. */ +#define EF_AMDGPU_FEATURE_XNACK_UNSUPPORTED_V4 0x000 +#define EF_AMDGPU_FEATURE_XNACK_ANY_V4 0x100 +#define EF_AMDGPU_FEATURE_XNACK_OFF_V4 0x200 +#define EF_AMDGPU_FEATURE_XNACK_ON_V4 0x300 + +#define EF_AMDGPU_FEATURE_SRAMECC_V4 0xc00 /* Mask. 
*/ +#define EF_AMDGPU_FEATURE_SRAMECC_UNSUPPORTED_V4 0x000 +#define EF_AMDGPU_FEATURE_SRAMECC_ANY_V4 0x400 +#define EF_AMDGPU_FEATURE_SRAMECC_OFF_V4 0x800 +#define EF_AMDGPU_FEATURE_SRAMECC_ON_V4 0xc00 + +#ifdef HAVE_GCN_ASM_V3_SYNTAX +#define SET_XNACK_ON(VAR) VAR |= EF_AMDGPU_XNACK_V3 +#define SET_XNACK_OFF(VAR) VAR &= ~EF_AMDGPU_XNACK_V3 +#define TEST_XNACK(VAR) (VAR & EF_AMDGPU_XNACK_V3) + +#define SET_SRAM_ECC_ON(VAR) VAR |= EF_AMDGPU_SRAM_ECC_V3 +#define SET_SRAM_ECC_ANY(VAR) SET_SRAM_ECC_ON (VAR) +#define SET_SRAM_ECC_OFF(VAR) VAR &= ~EF_AMDGPU_SRAM_ECC_V3 +#define TEST_SRAM_ECC_ANY(VAR) 0 /* Not supported. */ +#define TEST_SRAM_ECC_ON(VAR) (VAR & EF_AMDGPU_SRAM_ECC_V3) +#endif +#ifdef HAVE_GCN_ASM_V4_SYNTAX +#define SET_XNACK_ON(VAR) VAR = ((VAR & ~EF_AMDGPU_FEATURE_XNACK_V4) \ + | EF_AMDGPU_FEATURE_XNACK_ON_V4) +#define SET_XNACK_OFF(VAR) VAR = ((VAR & ~EF_AMDGPU_FEATURE_XNACK_V4) \ + | EF_AMDGPU_FEATURE_XNACK_OFF_V4) +#define TEST_XNACK(VAR) ((VAR & EF_AMDGPU_FEATURE_XNACK_V4) \ + == EF_AMDGPU_FEATURE_XNACK_ON_V4) + +#define SET_SRAM_ECC_ON(VAR) VAR = ((VAR & ~EF_AMDGPU_FEATURE_SRAMECC_V4) \ + | EF_AMDGPU_FEATURE_SRAMECC_ON_V4) +#define SET_SRAM_ECC_ANY(VAR) VAR = ((VAR & ~EF_AMDGPU_FEATURE_SRAMECC_V4) \ + | EF_AMDGPU_FEATURE_SRAMECC_ANY_V4) +#define SET_SRAM_ECC_OFF(VAR) VAR = ((VAR & ~EF_AMDGPU_FEATURE_SRAMECC_V4) \ + | EF_AMDGPU_FEATURE_SRAMECC_OFF_V4) +#define TEST_SRAM_ECC_ANY(VAR) ((VAR & EF_AMDGPU_FEATURE_SRAMECC_V4) \ + == EF_AMDGPU_FEATURE_SRAMECC_ANY_V4) +#define TEST_SRAM_ECC_ON(VAR) ((VAR & EF_AMDGPU_FEATURE_SRAMECC_V4) \ + == EF_AMDGPU_FEATURE_SRAMECC_ON_V4) +#endif #ifndef R_AMDGPU_NONE #define R_AMDGPU_NONE 0 @@ -80,7 +123,13 @@ static struct obstack files_to_cleanup; enum offload_abi offload_abi = OFFLOAD_ABI_UNSET; uint32_t elf_arch = EF_AMDGPU_MACH_AMDGCN_GFX803; // Default GPU architecture. -uint32_t elf_flags = 0; +uint32_t elf_flags = +#ifdef HAVE_GCN_ASM_V3_SYNTAX + 0; +#endif +#ifdef HAVE_GCN_ASM_V4_SYNTAX + (EF_AMDGPU_FEATURE_XNACK_ANY_V4 | EF_AMDGPU_FEATURE_SRAMECC_ANY_V4); +#endif /* Delete tempfiles. */ @@ -851,23 +900,22 @@ main (int argc, char **argv) else if (strcmp (argv[i], "-fpic") == 0) fpic = true; else if (strcmp (argv[i], "-mxnack") == 0) - elf_flags |= EF_AMDGPU_XNACK; + SET_XNACK_ON (elf_flags); else if (strcmp (argv[i], "-mno-xnack") == 0) - elf_flags &= ~EF_AMDGPU_XNACK; + SET_XNACK_OFF (elf_flags); else if (strcmp (argv[i], "-msram-ecc=on") == 0) { - elf_flags |= EF_AMDGPU_SRAM_ECC; + SET_SRAM_ECC_ON (elf_flags); sram_seen = true; } else if (strcmp (argv[i], "-msram-ecc=any") == 0) { - /* FIXME: change this when we move to HSACOv4. */ - elf_flags |= EF_AMDGPU_SRAM_ECC; + SET_SRAM_ECC_ANY (elf_flags); sram_seen = true; } else if (strcmp (argv[i], "-msram-ecc=off") == 0) { - elf_flags &= ~EF_AMDGPU_SRAM_ECC; + SET_SRAM_ECC_OFF (elf_flags); sram_seen = true; } else if (strcmp (argv[i], "-save-temps") == 0) @@ -890,23 +938,27 @@ main (int argc, char **argv) if (!(fopenacc ^ fopenmp)) fatal_error (input_location, "either -fopenacc or -fopenmp must be set"); - /* The SRAM-ECC feature defaults to "any" on GPUs where the feature is - available. */ if (!sram_seen) - switch (elf_arch) - { - case EF_AMDGPU_MACH_AMDGCN_GFX803: - case EF_AMDGPU_MACH_AMDGCN_GFX900: - case EF_AMDGPU_MACH_AMDGCN_GFX906: + { +#ifdef HAVE_GCN_ASM_V3_SYNTAX + /* For HSACOv3, the SRAM-ECC feature defaults to "on" on GPUs where the + feature is available. + (HSACOv4 has elf_flags initialsed to "any" in all cases.) 
*/ + switch (elf_arch) + { + case EF_AMDGPU_MACH_AMDGCN_GFX803: + case EF_AMDGPU_MACH_AMDGCN_GFX900: + case EF_AMDGPU_MACH_AMDGCN_GFX906: #ifndef HAVE_GCN_SRAM_ECC_GFX908 - case EF_AMDGPU_MACH_AMDGCN_GFX908: + case EF_AMDGPU_MACH_AMDGCN_GFX908: #endif - break; - default: - /* FIXME: change this when we move to HSACOv4. */ - elf_flags |= EF_AMDGPU_SRAM_ECC; - break; - } + break; + default: + SET_SRAM_ECC_ON (elf_flags); + break; + } +#endif + } const char *abi; switch (offload_abi) @@ -936,11 +988,12 @@ main (int argc, char **argv) if (fopenmp) obstack_ptr_grow (&cc_argv_obstack, "-mgomp"); obstack_ptr_grow (&cc_argv_obstack, - (elf_flags & EF_AMDGPU_XNACK + (TEST_XNACK (elf_flags) ? "-mxnack" : "-mno-xnack")); obstack_ptr_grow (&cc_argv_obstack, - (elf_flags & EF_AMDGPU_SRAM_ECC - ? "-msram-ecc=on" : "-msram-ecc=off")); + (TEST_SRAM_ECC_ON (elf_flags) ? "-msram-ecc=on" + : TEST_SRAM_ECC_ANY (elf_flags) ? "-msram-ecc=any" + : "-msram-ecc=off")); for (int ix = 1; ix != argc; ix++) { @@ -1043,11 +1096,12 @@ main (int argc, char **argv) obstack_ptr_grow (&ld_argv_obstack, gcn_s2_name); obstack_ptr_grow (&ld_argv_obstack, "-lgomp"); obstack_ptr_grow (&ld_argv_obstack, - (elf_flags & EF_AMDGPU_XNACK + (TEST_XNACK (elf_flags) ? "-mxnack" : "-mno-xnack")); obstack_ptr_grow (&ld_argv_obstack, - (elf_flags & EF_AMDGPU_SRAM_ECC - ? "-msram-ecc=on" : "-msram-ecc=off")); + (TEST_SRAM_ECC_ON (elf_flags) ? "-msram-ecc=on" + : TEST_SRAM_ECC_ANY (elf_flags) ? "-msram-ecc=any" + : "-msram-ecc=off")); if (verbose) obstack_ptr_grow (&ld_argv_obstack, "-v"); diff --git a/gcc/config/i386/avx512fp16intrin.h b/gcc/config/i386/avx512fp16intrin.h index 4714696..29cf679 100644 --- a/gcc/config/i386/avx512fp16intrin.h +++ b/gcc/config/i386/avx512fp16intrin.h @@ -45,6 +45,14 @@ typedef _Float16 __m128h __attribute__ ((__vector_size__ (16), __may_alias__)); typedef _Float16 __m256h __attribute__ ((__vector_size__ (32), __may_alias__)); typedef _Float16 __m512h __attribute__ ((__vector_size__ (64), __may_alias__)); +/* Unaligned version of the same type. 
*/ +typedef _Float16 __m128h_u __attribute__ ((__vector_size__ (16), \ + __may_alias__, __aligned__ (1))); +typedef _Float16 __m256h_u __attribute__ ((__vector_size__ (32), \ + __may_alias__, __aligned__ (1))); +typedef _Float16 __m512h_u __attribute__ ((__vector_size__ (64), \ + __may_alias__, __aligned__ (1))); + extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_set_ph (_Float16 __A7, _Float16 __A6, _Float16 __A5, @@ -362,6 +370,48 @@ _mm_load_sh (void const *__P) *(_Float16 const *) __P); } +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_load_ph (void const *__P) +{ + return *(const __m512h *) __P; +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_load_ph (void const *__P) +{ + return *(const __m256h *) __P; +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_ph (void const *__P) +{ + return *(const __m128h *) __P; +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_loadu_ph (void const *__P) +{ + return *(const __m512h_u *) __P; +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_loadu_ph (void const *__P) +{ + return *(const __m256h_u *) __P; +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadu_ph (void const *__P) +{ + return *(const __m128h_u *) __P; +} + /* Stores the lower _Float16 value. */ extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) @@ -370,6 +420,56 @@ _mm_store_sh (void *__P, __m128h __A) *(_Float16 *) __P = ((__v8hf)__A)[0]; } +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_store_ph (void *__P, __m512h __A) +{ + *(__m512h *) __P = __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_store_ph (void *__P, __m256h __A) +{ + *(__m256h *) __P = __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store_ph (void *__P, __m128h __A) +{ + *(__m128h *) __P = __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_storeu_ph (void *__P, __m512h __A) +{ + *(__m512h_u *) __P = __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_storeu_ph (void *__P, __m256h __A) +{ + *(__m256h_u *) __P = __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeu_ph (void *__P, __m128h __A) +{ + *(__m128h_u *) __P = __A; +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_abs_ph (__m512h __A) +{ + return (__m512h) _mm512_and_epi32 ( _mm512_set1_epi32 (0x7FFF7FFF), + (__m512i) __A); +} + /* Intrinsics v[add,sub,mul,div]ph. 
*/ extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) @@ -621,6 +721,33 @@ _mm512_maskz_div_round_ph (__mmask32 __A, __m512h __B, __m512h __C, (A), (D))) #endif /* __OPTIMIZE__ */ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_conj_pch (__m512h __A) +{ + return (__m512h) _mm512_xor_epi32 ((__m512i) __A, _mm512_set1_epi32 (1<<31)); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_conj_pch (__m512h __W, __mmask16 __U, __m512h __A) +{ + return (__m512h) + __builtin_ia32_movaps512_mask ((__v16sf) _mm512_conj_pch (__A), + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_conj_pch (__mmask16 __U, __m512h __A) +{ + return (__m512h) + __builtin_ia32_movaps512_mask ((__v16sf) _mm512_conj_pch (__A), + (__v16sf) _mm512_setzero_ps (), + (__mmask16) __U); +} + /* Intrinsics of v[add,sub,mul,div]sh. */ extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) @@ -6115,6 +6242,1006 @@ _mm_maskz_fnmsub_round_sh (__mmask8 __U, __m128h __W, __m128h __A, #endif /* __OPTIMIZE__ */ +/* Intrinsics vf[,c]maddcph. */ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fcmadd_pch (__m512h __A, __m512h __B, __m512h __C) +{ + return (__m512h) + __builtin_ia32_vfcmaddcph512_round ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fcmadd_pch (__m512h __A, __mmask16 __B, __m512h __C, __m512h __D) +{ + return (__m512h) __builtin_ia32_movaps512_mask + ((__v16sf) + __builtin_ia32_vfcmaddcph512_mask_round ((__v32hf) __A, + (__v32hf) __C, + (__v32hf) __D, __B, + _MM_FROUND_CUR_DIRECTION), + (__v16sf) __A, __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fcmadd_pch (__m512h __A, __m512h __B, __m512h __C, __mmask16 __D) +{ + return (__m512h) + __builtin_ia32_vfcmaddcph512_mask_round ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + __D, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fcmadd_pch (__mmask16 __A, __m512h __B, __m512h __C, __m512h __D) +{ + return (__m512h) + __builtin_ia32_vfcmaddcph512_maskz_round ((__v32hf) __B, + (__v32hf) __C, + (__v32hf) __D, + __A, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmadd_pch (__m512h __A, __m512h __B, __m512h __C) +{ + return (__m512h) + __builtin_ia32_vfmaddcph512_round ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmadd_pch (__m512h __A, __mmask16 __B, __m512h __C, __m512h __D) +{ + return (__m512h) __builtin_ia32_movaps512_mask + ((__v16sf) + __builtin_ia32_vfmaddcph512_mask_round ((__v32hf) __A, + (__v32hf) __C, + (__v32hf) __D, __B, + _MM_FROUND_CUR_DIRECTION), + (__v16sf) __A, __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmadd_pch (__m512h __A, __m512h __B, __m512h __C, __mmask16 __D) +{ + return (__m512h) + 
__builtin_ia32_vfmaddcph512_mask_round ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + __D, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmadd_pch (__mmask16 __A, __m512h __B, __m512h __C, __m512h __D) +{ + return (__m512h) + __builtin_ia32_vfmaddcph512_maskz_round ((__v32hf) __B, + (__v32hf) __C, + (__v32hf) __D, + __A, _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fcmadd_round_pch (__m512h __A, __m512h __B, __m512h __C, const int __D) +{ + return (__m512h) + __builtin_ia32_vfcmaddcph512_round ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + __D); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fcmadd_round_pch (__m512h __A, __mmask16 __B, __m512h __C, + __m512h __D, const int __E) +{ + return (__m512h) __builtin_ia32_movaps512_mask + ((__v16sf) + __builtin_ia32_vfcmaddcph512_mask_round ((__v32hf) __A, + (__v32hf) __C, + (__v32hf) __D, __B, + __E), + (__v16sf) __A, __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fcmadd_round_pch (__m512h __A, __m512h __B, __m512h __C, + __mmask16 __D, const int __E) +{ + return (__m512h) + __builtin_ia32_vfcmaddcph512_mask_round ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + __D, __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fcmadd_round_pch (__mmask16 __A, __m512h __B, __m512h __C, + __m512h __D, const int __E) +{ + return (__m512h) + __builtin_ia32_vfcmaddcph512_maskz_round ((__v32hf) __B, + (__v32hf) __C, + (__v32hf) __D, + __A, __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmadd_round_pch (__m512h __A, __m512h __B, __m512h __C, const int __D) +{ + return (__m512h) + __builtin_ia32_vfmaddcph512_round ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + __D); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmadd_round_pch (__m512h __A, __mmask16 __B, __m512h __C, + __m512h __D, const int __E) +{ + return (__m512h) __builtin_ia32_movaps512_mask + ((__v16sf) + __builtin_ia32_vfmaddcph512_mask_round ((__v32hf) __A, + (__v32hf) __C, + (__v32hf) __D, __B, + __E), + (__v16sf) __A, __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmadd_round_pch (__m512h __A, __m512h __B, __m512h __C, + __mmask16 __D, const int __E) +{ + return (__m512h) + __builtin_ia32_vfmaddcph512_mask_round ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + __D, __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmadd_round_pch (__mmask16 __A, __m512h __B, __m512h __C, + __m512h __D, const int __E) +{ + return (__m512h) + __builtin_ia32_vfmaddcph512_maskz_round ((__v32hf) __B, + (__v32hf) __C, + (__v32hf) __D, + __A, __E); +} + +#else +#define _mm512_fcmadd_round_pch(A, B, C, D) \ + (__m512h) __builtin_ia32_vfcmaddcph512_round ((A), (B), (C), (D)) + +#define _mm512_mask_fcmadd_round_pch(A, B, C, D, E) \ + ((__m512h) __builtin_ia32_movaps512_mask ( \ + (__v16sf) \ + __builtin_ia32_vfcmaddcph512_mask_round ((__v32hf) (A), \ + (__v32hf) (C), \ + (__v32hf) (D), \ + (B), (E)), \ + (__v16sf) (A), (B))); + + +#define 
_mm512_mask3_fcmadd_round_pch(A, B, C, D, E) \ + ((__m512h) \ + __builtin_ia32_vfcmaddcph512_mask_round ((A), (B), (C), (D), (E))) + +#define _mm512_maskz_fcmadd_round_pch(A, B, C, D, E) \ + (__m512h) \ + __builtin_ia32_vfcmaddcph512_maskz_round ((B), (C), (D), (A), (E)) + +#define _mm512_fmadd_round_pch(A, B, C, D) \ + (__m512h) __builtin_ia32_vfmaddcph512_round ((A), (B), (C), (D)) + +#define _mm512_mask_fmadd_round_pch(A, B, C, D, E) \ + ((__m512h) __builtin_ia32_movaps512_mask ( \ + (__v16sf) \ + __builtin_ia32_vfmaddcph512_mask_round ((__v32hf) (A), \ + (__v32hf) (C), \ + (__v32hf) (D), \ + (B), (E)), \ + (__v16sf) (A), (B))); + +#define _mm512_mask3_fmadd_round_pch(A, B, C, D, E) \ + (__m512h) \ + __builtin_ia32_vfmaddcph512_mask_round ((A), (B), (C), (D), (E)) + +#define _mm512_maskz_fmadd_round_pch(A, B, C, D, E) \ + (__m512h) \ + __builtin_ia32_vfmaddcph512_maskz_round ((B), (C), (D), (A), (E)) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vf[,c]mulcph. */ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fcmul_pch (__m512h __A, __m512h __B) +{ + return (__m512h) + __builtin_ia32_vfcmulcph512_round ((__v32hf) __A, + (__v32hf) __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fcmul_pch (__m512h __A, __mmask16 __B, __m512h __C, __m512h __D) +{ + return (__m512h) + __builtin_ia32_vfcmulcph512_mask_round ((__v32hf) __C, + (__v32hf) __D, + (__v32hf) __A, + __B, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fcmul_pch (__mmask16 __A, __m512h __B, __m512h __C) +{ + return (__m512h) + __builtin_ia32_vfcmulcph512_mask_round ((__v32hf) __B, + (__v32hf) __C, + _mm512_setzero_ph (), + __A, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmul_pch (__m512h __A, __m512h __B) +{ + return (__m512h) + __builtin_ia32_vfmulcph512_round ((__v32hf) __A, + (__v32hf) __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmul_pch (__m512h __A, __mmask16 __B, __m512h __C, __m512h __D) +{ + return (__m512h) + __builtin_ia32_vfmulcph512_mask_round ((__v32hf) __C, + (__v32hf) __D, + (__v32hf) __A, + __B, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmul_pch (__mmask16 __A, __m512h __B, __m512h __C) +{ + return (__m512h) + __builtin_ia32_vfmulcph512_mask_round ((__v32hf) __B, + (__v32hf) __C, + _mm512_setzero_ph (), + __A, _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fcmul_round_pch (__m512h __A, __m512h __B, const int __D) +{ + return (__m512h) + __builtin_ia32_vfcmulcph512_round ((__v32hf) __A, + (__v32hf) __B, __D); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fcmul_round_pch (__m512h __A, __mmask16 __B, __m512h __C, + __m512h __D, const int __E) +{ + return (__m512h) + __builtin_ia32_vfcmulcph512_mask_round ((__v32hf) __C, + (__v32hf) __D, + (__v32hf) __A, + __B, __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fcmul_round_pch (__mmask16 __A, __m512h __B, + 
__m512h __C, const int __E) +{ + return (__m512h) + __builtin_ia32_vfcmulcph512_mask_round ((__v32hf) __B, + (__v32hf) __C, + _mm512_setzero_ph (), + __A, __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmul_round_pch (__m512h __A, __m512h __B, const int __D) +{ + return (__m512h) + __builtin_ia32_vfmulcph512_round ((__v32hf) __A, + (__v32hf) __B, + __D); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmul_round_pch (__m512h __A, __mmask16 __B, __m512h __C, + __m512h __D, const int __E) +{ + return (__m512h) + __builtin_ia32_vfmulcph512_mask_round ((__v32hf) __C, + (__v32hf) __D, + (__v32hf) __A, + __B, __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmul_round_pch (__mmask16 __A, __m512h __B, + __m512h __C, const int __E) +{ + return (__m512h) + __builtin_ia32_vfmulcph512_mask_round ((__v32hf) __B, + (__v32hf) __C, + _mm512_setzero_ph (), + __A, __E); +} + +#else +#define _mm512_fcmul_round_pch(A, B, D) \ + (__m512h) __builtin_ia32_vfcmulcph512_round ((A), (B), (D)) + +#define _mm512_mask_fcmul_round_pch(A, B, C, D, E) \ + (__m512h) __builtin_ia32_vfcmulcph512_mask_round ((C), (D), (A), (B), (E)) + +#define _mm512_maskz_fcmul_round_pch(A, B, C, E) \ + (__m512h) __builtin_ia32_vfcmulcph512_mask_round ((B), (C), \ + (__v32hf) \ + _mm512_setzero_ph (), \ + (A), (E)) + +#define _mm512_fmul_round_pch(A, B, D) \ + (__m512h) __builtin_ia32_vfmulcph512_round ((A), (B), (D)) + +#define _mm512_mask_fmul_round_pch(A, B, C, D, E) \ + (__m512h) __builtin_ia32_vfmulcph512_mask_round ((C), (D), (A), (B), (E)) + +#define _mm512_maskz_fmul_round_pch(A, B, C, E) \ + (__m512h) __builtin_ia32_vfmulcph512_mask_round ((B), (C), \ + (__v32hf) \ + _mm512_setzero_ph (), \ + (A), (E)) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vf[,c]maddcsh. 
*/ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fcmadd_sch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ +#ifdef __AVX512VL__ + return (__m128h) __builtin_ia32_movaps128_mask ( + (__v4sf) + __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) __A, + (__v8hf) __C, + (__v8hf) __D, __B, + _MM_FROUND_CUR_DIRECTION), + (__v4sf) __A, __B); +#else + return (__m128h) __builtin_ia32_blendvps ((__v4sf) __A, + (__v4sf) + __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) __A, + (__v8hf) __C, + (__v8hf) __D, __B, + _MM_FROUND_CUR_DIRECTION), + (__v4sf) _mm_set_ss ((float) ((int) __B << 31))); +#endif +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fcmadd_sch (__m128h __A, __m128h __B, __m128h __C, __mmask8 __D) +{ + return (__m128h) _mm_move_ss ((__m128) __C, + (__m128) + __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, __D, + _MM_FROUND_CUR_DIRECTION)); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fcmadd_sch (__mmask8 __A, __m128h __B, __m128h __C, __m128h __D) +{ + return (__m128h) + __builtin_ia32_vfcmaddcsh_maskz_round ((__v8hf) __B, + (__v8hf) __C, + (__v8hf) __D, + __A, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fcmadd_sch (__m128h __A, __m128h __B, __m128h __C) +{ + return (__m128h) + __builtin_ia32_vfcmaddcsh_round ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmadd_sch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ +#ifdef __AVX512VL__ + return (__m128h) __builtin_ia32_movaps128_mask ( + (__v4sf) + __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) __A, + (__v8hf) __C, + (__v8hf) __D, __B, + _MM_FROUND_CUR_DIRECTION), + (__v4sf) __A, __B); +#else + return (__m128h) __builtin_ia32_blendvps ((__v4sf) __A, + (__v4sf) + __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) __A, + (__v8hf) __C, + (__v8hf) __D, __B, + _MM_FROUND_CUR_DIRECTION), + (__v4sf) _mm_set_ss ((float) ((int) __B << 31))); +#endif +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmadd_sch (__m128h __A, __m128h __B, __m128h __C, __mmask8 __D) +{ + return (__m128h) _mm_move_ss ((__m128) __C, + (__m128) + __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, __D, + _MM_FROUND_CUR_DIRECTION)); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmadd_sch (__mmask8 __A, __m128h __B, __m128h __C, __m128h __D) +{ + return (__m128h) + __builtin_ia32_vfmaddcsh_maskz_round ((__v8hf) __B, + (__v8hf) __C, + (__v8hf) __D, + __A, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmadd_sch (__m128h __A, __m128h __B, __m128h __C) +{ + return (__m128h) + __builtin_ia32_vfmaddcsh_round ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fcmadd_round_sch (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, const int __E) +{ +#ifdef __AVX512VL__ + return (__m128h) __builtin_ia32_movaps128_mask ( + (__v4sf) + 
__builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) __A, + (__v8hf) __C, + (__v8hf) __D, + __B, __E), + (__v4sf) __A, __B); +#else + return (__m128h) __builtin_ia32_blendvps ((__v4sf) __A, + (__v4sf) + __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) __A, + (__v8hf) __C, + (__v8hf) __D, + __B, __E), + (__v4sf) _mm_set_ss ((float) ((int) __B << 31))); +#endif +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fcmadd_round_sch (__m128h __A, __m128h __B, __m128h __C, + __mmask8 __D, const int __E) +{ + return (__m128h) _mm_move_ss ((__m128) __C, + (__m128) + __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + __D, __E)); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fcmadd_round_sch (__mmask8 __A, __m128h __B, __m128h __C, + __m128h __D, const int __E) +{ + return (__m128h) + __builtin_ia32_vfcmaddcsh_maskz_round ((__v8hf) __B, + (__v8hf) __C, + (__v8hf) __D, + __A, __E); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fcmadd_round_sch (__m128h __A, __m128h __B, __m128h __C, const int __D) +{ + return (__m128h) + __builtin_ia32_vfcmaddcsh_round ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + __D); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmadd_round_sch (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, const int __E) +{ +#ifdef __AVX512VL__ + return (__m128h) __builtin_ia32_movaps128_mask ( + (__v4sf) + __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) __A, + (__v8hf) __C, + (__v8hf) __D, + __B, __E), + (__v4sf) __A, __B); +#else + return (__m128h) __builtin_ia32_blendvps ((__v4sf) __A, + (__v4sf) + __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) __A, + (__v8hf) __C, + (__v8hf) __D, + __B, __E), + (__v4sf) _mm_set_ss ((float) ((int) __B << 31))); +#endif +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmadd_round_sch (__m128h __A, __m128h __B, __m128h __C, + __mmask8 __D, const int __E) +{ + return (__m128h) _mm_move_ss ((__m128) __C, + (__m128) + __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + __D, __E)); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmadd_round_sch (__mmask8 __A, __m128h __B, __m128h __C, + __m128h __D, const int __E) +{ + return (__m128h) + __builtin_ia32_vfmaddcsh_maskz_round ((__v8hf) __B, + (__v8hf) __C, + (__v8hf) __D, + __A, __E); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmadd_round_sch (__m128h __A, __m128h __B, __m128h __C, const int __D) +{ + return (__m128h) + __builtin_ia32_vfmaddcsh_round ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + __D); +} +#else +#ifdef __AVX512VL__ +#define _mm_mask_fcmadd_round_sch(A, B, C, D, E) \ + ((__m128h) __builtin_ia32_movaps128_mask ( \ + (__v4sf) \ + __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) (A), \ + (__v8hf) (C), \ + (__v8hf) (D), \ + (B), (E)), \ + (__v4sf) (A), (B))) + +#else +#define _mm_mask_fcmadd_round_sch(A, B, C, D, E) \ + ((__m128h) __builtin_ia32_blendvps ((__v4sf) (A), \ + (__v4sf) \ + __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) (A), \ + (__v8hf) (C), \ + (__v8hf) (D), \ + (B), (E)), \ + (__v4sf) _mm_set_ss ((float) ((int) (B) << 31)))) +#endif + +#define _mm_mask3_fcmadd_round_sch(A, B, C, D, E) \ + ((__m128h) 
_mm_move_ss ((__m128) (C), \ + (__m128) \ + __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) (A), \ + (__v8hf) (B), \ + (__v8hf) (C), \ + (D), (E)))) + +#define _mm_maskz_fcmadd_round_sch(A, B, C, D, E) \ + __builtin_ia32_vfcmaddcsh_maskz_round ((B), (C), (D), (A), (E)) + +#define _mm_fcmadd_round_sch(A, B, C, D) \ + __builtin_ia32_vfcmaddcsh_round ((A), (B), (C), (D)) + +#ifdef __AVX512VL__ +#define _mm_mask_fmadd_round_sch(A, B, C, D, E) \ + ((__m128h) __builtin_ia32_movaps128_mask ( \ + (__v4sf) \ + __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) (A), \ + (__v8hf) (C), \ + (__v8hf) (D), \ + (B), (E)), \ + (__v4sf) (A), (B))) + +#else +#define _mm_mask_fmadd_round_sch(A, B, C, D, E) \ + ((__m128h) __builtin_ia32_blendvps ((__v4sf) (A), \ + (__v4sf) \ + __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) (A), \ + (__v8hf) (C), \ + (__v8hf) (D), \ + (B), (E)), \ + (__v4sf) _mm_set_ss ((float) ((int) (B) << 31)))) +#endif + +#define _mm_mask3_fmadd_round_sch(A, B, C, D, E) \ + ((__m128h) _mm_move_ss ((__m128) (C), \ + (__m128) \ + __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) (A), \ + (__v8hf) (B), \ + (__v8hf) (C), \ + (D), (E)))) + +#define _mm_maskz_fmadd_round_sch(A, B, C, D, E) \ + __builtin_ia32_vfmaddcsh_maskz_round ((B), (C), (D), (A), (E)) + +#define _mm_fmadd_round_sch(A, B, C, D) \ + __builtin_ia32_vfmaddcsh_round ((A), (B), (C), (D)) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vf[,c]mulcsh. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fcmul_sch (__m128h __A, __m128h __B) +{ + return (__m128h) + __builtin_ia32_vfcmulcsh_round ((__v8hf) __A, + (__v8hf) __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fcmul_sch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return (__m128h) + __builtin_ia32_vfcmulcsh_mask_round ((__v8hf) __C, + (__v8hf) __D, + (__v8hf) __A, + __B, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fcmul_sch (__mmask8 __A, __m128h __B, __m128h __C) +{ + return (__m128h) + __builtin_ia32_vfcmulcsh_mask_round ((__v8hf) __B, + (__v8hf) __C, + _mm_setzero_ph (), + __A, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmul_sch (__m128h __A, __m128h __B) +{ + return (__m128h) + __builtin_ia32_vfmulcsh_round ((__v8hf) __A, + (__v8hf) __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmul_sch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return (__m128h) + __builtin_ia32_vfmulcsh_mask_round ((__v8hf) __C, + (__v8hf) __D, + (__v8hf) __A, + __B, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmul_sch (__mmask8 __A, __m128h __B, __m128h __C) +{ + return (__m128h) + __builtin_ia32_vfmulcsh_mask_round ((__v8hf) __B, + (__v8hf) __C, + _mm_setzero_ph (), + __A, _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fcmul_round_sch (__m128h __A, __m128h __B, const int __D) +{ + return (__m128h) + __builtin_ia32_vfcmulcsh_round ((__v8hf) __A, + (__v8hf) __B, + __D); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm_mask_fcmul_round_sch (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, const int __E) +{ + return (__m128h) + __builtin_ia32_vfcmulcsh_mask_round ((__v8hf) __C, + (__v8hf) __D, + (__v8hf) __A, + __B, __E); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fcmul_round_sch (__mmask8 __A, __m128h __B, __m128h __C, + const int __E) +{ + return (__m128h) + __builtin_ia32_vfcmulcsh_mask_round ((__v8hf) __B, + (__v8hf) __C, + _mm_setzero_ph (), + __A, __E); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmul_round_sch (__m128h __A, __m128h __B, const int __D) +{ + return (__m128h) + __builtin_ia32_vfmulcsh_round ((__v8hf) __A, + (__v8hf) __B, __D); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmul_round_sch (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, const int __E) +{ + return (__m128h) + __builtin_ia32_vfmulcsh_mask_round ((__v8hf) __C, + (__v8hf) __D, + (__v8hf) __A, + __B, __E); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmul_round_sch (__mmask8 __A, __m128h __B, __m128h __C, const int __E) +{ + return (__m128h) + __builtin_ia32_vfmulcsh_mask_round ((__v8hf) __B, + (__v8hf) __C, + _mm_setzero_ph (), + __A, __E); +} + +#else +#define _mm_fcmul_round_sch(__A, __B, __D) \ + (__m128h) __builtin_ia32_vfcmulcsh_round ((__v8hf) __A, \ + (__v8hf) __B, __D) + +#define _mm_mask_fcmul_round_sch(__A, __B, __C, __D, __E) \ + (__m128h) __builtin_ia32_vfcmulcsh_mask_round ((__v8hf) __C, \ + (__v8hf) __D, \ + (__v8hf) __A, \ + __B, __E) + +#define _mm_maskz_fcmul_round_sch(__A, __B, __C, __E) \ + (__m128h) __builtin_ia32_vfcmulcsh_mask_round ((__v8hf) __B, \ + (__v8hf) __C, \ + _mm_setzero_ph (), \ + __A, __E) + +#define _mm_fmul_round_sch(__A, __B, __D) \ + (__m128h) __builtin_ia32_vfmulcsh_round ((__v8hf) __A, \ + (__v8hf) __B, __D) + +#define _mm_mask_fmul_round_sch(__A, __B, __C, __D, __E) \ + (__m128h) __builtin_ia32_vfmulcsh_mask_round ((__v8hf) __C, \ + (__v8hf) __D, \ + (__v8hf) __A, \ + __B, __E) + +#define _mm_maskz_fmul_round_sch(__A, __B, __C, __E) \ + (__m128h) __builtin_ia32_vfmulcsh_mask_round ((__v8hf) __B, \ + (__v8hf) __C, \ + _mm_setzero_ph (), \ + __A, __E) + +#endif /* __OPTIMIZE__ */ + +#define _MM512_REDUCE_OP(op) \ + __m256h __T1 = (__m256h) _mm512_extractf64x4_pd ((__m512d) __A, 0); \ + __m256h __T2 = (__m256h) _mm512_extractf64x4_pd ((__m512d) __A, 1); \ + __m256h __T3 = (__T1 op __T2); \ + __m128h __T4 = (__m128h) _mm256_extractf128_pd ((__m256d) __T3, 0); \ + __m128h __T5 = (__m128h) _mm256_extractf128_pd ((__m256d) __T3, 1); \ + __m128h __T6 = (__T4 op __T5); \ + __m128h __T7 = (__m128h) __builtin_shuffle ((__m128h)__T6, \ + (__v8hi) { 4, 5, 6, 7, 0, 1, 2, 3 }); \ + __m128h __T8 = (__T6 op __T7); \ + __m128h __T9 = (__m128h) __builtin_shuffle ((__m128h)__T8, \ + (__v8hi) { 2, 3, 0, 1, 4, 5, 6, 7 }); \ + __m128h __T10 = __T8 op __T9; \ + return __T10[0] op __T10[1] + +// TODO reduce +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_add_ph (__m512h __A) +{ + _MM512_REDUCE_OP (+); +} + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_mul_ph (__m512h __A) +{ + _MM512_REDUCE_OP (*); +} + +#undef _MM512_REDUCE_OP + +#ifdef __AVX512VL__ + +#define _MM512_REDUCE_OP(op) \ + __m256h __T1 = (__m256h) 
_mm512_extractf64x4_pd ((__m512d) __A, 0); \ + __m256h __T2 = (__m256h) _mm512_extractf64x4_pd ((__m512d) __A, 1); \ + __m256h __T3 = __builtin_ia32_##op##ph256_mask (__T1, __T2, \ + _mm256_setzero_ph (), (__mmask16) -1); \ + __m128h __T4 = (__m128h) _mm256_extractf128_pd ((__m256d) __T3, 0); \ + __m128h __T5 = (__m128h) _mm256_extractf128_pd ((__m256d) __T3, 1); \ + __m128h __T6 = __builtin_ia32_##op##ph128_mask \ + (__T4, __T5, _mm_setzero_ph (),(__mmask8) -1); \ + __m128h __T7 = (__m128h) __builtin_shuffle ((__m128h)__T6, \ + (__v8hi) { 2, 3, 0, 1, 6, 7, 4, 5 }); \ + __m128h __T8 = (__m128h) __builtin_ia32_##op##ph128_mask \ + (__T6, __T7, _mm_setzero_ph (),(__mmask8) -1); \ + __m128h __T9 = (__m128h) __builtin_shuffle ((__m128h)__T8, \ + (__v8hi) { 4, 5 }); \ + __m128h __T10 = __builtin_ia32_##op##ph128_mask \ + (__T8, __T9, _mm_setzero_ph (),(__mmask8) -1); \ + __m128h __T11 = (__m128h) __builtin_shuffle (__T10, \ + (__v8hi) { 1, 0 }); \ + __m128h __T12 = __builtin_ia32_##op##ph128_mask \ + (__T10, __T11, _mm_setzero_ph (),(__mmask8) -1); \ + return __T12[0] + +#else + +#define _MM512_REDUCE_OP(op) \ + __m512h __T1 = (__m512h) __builtin_shuffle ((__m512d) __A, \ + (__v8di) { 4, 5, 6, 7, 0, 0, 0, 0 }); \ + __m512h __T2 = _mm512_##op##_ph (__A, __T1); \ + __m512h __T3 = (__m512h) __builtin_shuffle ((__m512d) __T2, \ + (__v8di) { 2, 3, 0, 0, 0, 0, 0, 0 }); \ + __m512h __T4 = _mm512_##op##_ph (__T2, __T3); \ + __m512h __T5 = (__m512h) __builtin_shuffle ((__m512d) __T4, \ + (__v8di) { 1, 0, 0, 0, 0, 0, 0, 0 }); \ + __m512h __T6 = _mm512_##op##_ph (__T4, __T5); \ + __m512h __T7 = (__m512h) __builtin_shuffle ((__m512) __T6, \ + (__v16si) { 1, 0, 0, 0, 0, 0, 0, 0, \ + 0, 0, 0, 0, 0, 0, 0, 0 }); \ + __m512h __T8 = _mm512_##op##_ph (__T6, __T7); \ + __m512h __T9 = (__m512h) __builtin_shuffle (__T8, \ + (__v32hi) { 1, 0, 0, 0, 0, 0, 0, 0, \ + 0, 0, 0, 0, 0, 0, 0, 0, \ + 0, 0, 0, 0, 0, 0, 0, 0, \ + 0, 0, 0, 0, 0, 0, 0, 0 }); \ + __m512h __T10 = _mm512_##op##_ph (__T8, __T9); \ + return __T10[0] +#endif + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_min_ph (__m512h __A) +{ + _MM512_REDUCE_OP (min); +} + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_max_ph (__m512h __A) +{ + _MM512_REDUCE_OP (max); +} + +#undef _MM512_REDUCE_OP + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_blend_ph (__mmask32 __U, __m512h __A, __m512h __W) +{ + return (__m512h) __builtin_ia32_movdquhi512_mask ((__v32hi) __W, + (__v32hi) __A, + (__mmask32) __U); + +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutex2var_ph (__m512h __A, __m512i __I, __m512h __B) +{ + return (__m512h) __builtin_ia32_vpermi2varhi512_mask ((__v32hi) __A, + (__v32hi) __I, + (__v32hi) __B, + (__mmask32)-1); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutexvar_ph (__m512i __A, __m512h __B) +{ + return (__m512h) __builtin_ia32_permvarhi512_mask ((__v32hi) __B, + (__v32hi) __A, + (__v32hi) + (_mm512_setzero_ph ()), + (__mmask32)-1); +} + #ifdef __DISABLE_AVX512FP16__ #undef __DISABLE_AVX512FP16__ #pragma GCC pop_options diff --git a/gcc/config/i386/avx512fp16vlintrin.h b/gcc/config/i386/avx512fp16vlintrin.h index 1292c02..3d3de96 100644 --- a/gcc/config/i386/avx512fp16vlintrin.h +++ b/gcc/config/i386/avx512fp16vlintrin.h @@ -151,6 +151,59 @@ 
_mm256_zextph128_ph256 (__m128h __A) (__m128) __A, 0); } +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_conj_pch (__m256h __A) +{ + return (__m256h) _mm256_xor_epi32 ((__m256i) __A, _mm256_set1_epi32 (1<<31)); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_conj_pch (__m256h __W, __mmask8 __U, __m256h __A) +{ + return (__m256h) __builtin_ia32_movaps256_mask ((__v8sf) + _mm256_conj_pch (__A), + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_conj_pch (__mmask8 __U, __m256h __A) +{ + return (__m256h) __builtin_ia32_movaps256_mask ((__v8sf) + _mm256_conj_pch (__A), + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_conj_pch (__m128h __A) +{ + return (__m128h) _mm_xor_epi32 ((__m128i) __A, _mm_set1_epi32 (1<<31)); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_conj_pch (__m128h __W, __mmask8 __U, __m128h __A) +{ + return (__m128h) __builtin_ia32_movaps128_mask ((__v4sf) _mm_conj_pch (__A), + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_conj_pch (__mmask8 __U, __m128h __A) +{ + return (__m128h) __builtin_ia32_movaps128_mask ((__v4sf) _mm_conj_pch (__A), + (__v4sf) _mm_setzero_ps (), + (__mmask8) __U); +} + /* Intrinsics v[add,sub,mul,div]ph. */ extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) @@ -425,6 +478,22 @@ _mm256_maskz_min_ph (__mmask16 __A, __m256h __B, __m256h __C) _mm256_setzero_ph (), __A); } +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_abs_ph (__m128h __A) +{ + return (__m128h) _mm_and_si128 ( _mm_set1_epi32 (0x7FFF7FFF), + (__m128i) __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_abs_ph (__m256h __A) +{ + return (__m256h) _mm256_and_si256 ( _mm256_set1_epi32 (0x7FFF7FFF), + (__m256i) __A); +} + /* vcmpph */ #ifdef __OPTIMIZE extern __inline __mmask8 @@ -2815,6 +2884,437 @@ _mm_maskz_fnmsub_ph (__mmask8 __U, __m128h __A, __m128h __B, __U); } +/* Intrinsics vf[,c]maddcph. 
*/ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmadd_pch (__m128h __A, __m128h __B, __m128h __C) +{ + return (__m128h) __builtin_ia32_vfmaddcph128 ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmadd_pch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return (__m128h) __builtin_ia32_movaps128_mask + ((__v4sf) + __builtin_ia32_vfmaddcph128_mask ((__v8hf) __A, + (__v8hf) __C, + (__v8hf) __D, __B), + (__v4sf) __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmadd_pch (__m128h __A, __m128h __B, __m128h __C, __mmask8 __D) +{ + return (__m128h) __builtin_ia32_vfmaddcph128_mask ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, __D); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmadd_pch (__mmask8 __A, __m128h __B, __m128h __C, __m128h __D) +{ + return (__m128h) __builtin_ia32_vfmaddcph128_maskz ((__v8hf) __B, + (__v8hf) __C, + (__v8hf) __D, __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fmadd_pch (__m256h __A, __m256h __B, __m256h __C) +{ + return (__m256h) __builtin_ia32_vfmaddcph256 ((__v16hf) __A, + (__v16hf) __B, + (__v16hf) __C); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fmadd_pch (__m256h __A, __mmask8 __B, __m256h __C, __m256h __D) +{ + return (__m256h) __builtin_ia32_movaps256_mask + ((__v8sf) + __builtin_ia32_vfmaddcph256_mask ((__v16hf) __A, + (__v16hf) __C, + (__v16hf) __D, __B), + (__v8sf) __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fmadd_pch (__m256h __A, __m256h __B, __m256h __C, __mmask8 __D) +{ + return (__m256h) __builtin_ia32_vfmaddcph256_mask ((__v16hf) __A, + (__v16hf) __B, + (__v16hf) __C, __D); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fmadd_pch (__mmask8 __A, __m256h __B, __m256h __C, __m256h __D) +{ + return (__m256h)__builtin_ia32_vfmaddcph256_maskz ((__v16hf) __B, + (__v16hf) __C, + (__v16hf) __D, __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fcmadd_pch (__m128h __A, __m128h __B, __m128h __C) +{ + return (__m128h) __builtin_ia32_vfcmaddcph128 ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fcmadd_pch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return (__m128h) __builtin_ia32_movaps128_mask + ((__v4sf) + __builtin_ia32_vfcmaddcph128_mask ((__v8hf) __A, + (__v8hf) __C, + (__v8hf) __D, __B), + (__v4sf) __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fcmadd_pch (__m128h __A, __m128h __B, __m128h __C, __mmask8 __D) +{ + return (__m128h) __builtin_ia32_vfcmaddcph128_mask ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, __D); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fcmadd_pch (__mmask8 __A, __m128h __B, __m128h __C, __m128h __D) +{ + return (__m128h)__builtin_ia32_vfcmaddcph128_maskz ((__v8hf) __B, + (__v8hf) __C, + (__v8hf) __D, __A); +} + +extern __inline __m256h 
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fcmadd_pch (__m256h __A, __m256h __B, __m256h __C) +{ + return (__m256h) __builtin_ia32_vfcmaddcph256 ((__v16hf) __A, + (__v16hf) __B, + (__v16hf) __C); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fcmadd_pch (__m256h __A, __mmask8 __B, __m256h __C, __m256h __D) +{ + return (__m256h) __builtin_ia32_movaps256_mask + ((__v8sf) + __builtin_ia32_vfcmaddcph256_mask ((__v16hf) __A, + (__v16hf) __C, + (__v16hf) __D, __B), + (__v8sf) __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fcmadd_pch (__m256h __A, __m256h __B, __m256h __C, __mmask8 __D) +{ + return (__m256h) __builtin_ia32_vfcmaddcph256_mask ((__v16hf) __A, + (__v16hf) __B, + (__v16hf) __C, __D); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fcmadd_pch (__mmask8 __A, __m256h __B, __m256h __C, __m256h __D) +{ + return (__m256h) __builtin_ia32_vfcmaddcph256_maskz ((__v16hf) __B, + (__v16hf) __C, + (__v16hf) __D, __A); +} + +/* Intrinsics vf[,c]mulcph. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmul_pch (__m128h __A, __m128h __B) +{ + return (__m128h) __builtin_ia32_vfmulcph128 ((__v8hf) __A, (__v8hf) __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmul_pch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return (__m128h) __builtin_ia32_vfmulcph128_mask ((__v8hf) __C, + (__v8hf) __D, + (__v8hf) __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmul_pch (__mmask8 __A, __m128h __B, __m128h __C) +{ + return (__m128h) __builtin_ia32_vfmulcph128_mask ((__v8hf) __B, + (__v8hf) __C, + _mm_setzero_ph (), + __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fmul_pch (__m256h __A, __m256h __B) +{ + return (__m256h) __builtin_ia32_vfmulcph256 ((__v16hf) __A, + (__v16hf) __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fmul_pch (__m256h __A, __mmask8 __B, __m256h __C, __m256h __D) +{ + return (__m256h) __builtin_ia32_vfmulcph256_mask ((__v16hf) __C, + (__v16hf) __D, + (__v16hf) __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fmul_pch (__mmask8 __A, __m256h __B, __m256h __C) +{ + return (__m256h) __builtin_ia32_vfmulcph256_mask ((__v16hf) __B, + (__v16hf) __C, + _mm256_setzero_ph (), + __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fcmul_pch (__m128h __A, __m128h __B) +{ + return (__m128h) __builtin_ia32_vfcmulcph128 ((__v8hf) __A, + (__v8hf) __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fcmul_pch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return (__m128h) __builtin_ia32_vfcmulcph128_mask ((__v8hf) __C, + (__v8hf) __D, + (__v8hf) __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fcmul_pch (__mmask8 __A, __m128h __B, __m128h __C) +{ + return (__m128h) __builtin_ia32_vfcmulcph128_mask ((__v8hf) __B, + (__v8hf) __C, + _mm_setzero_ph (), + __A); +} + 
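
A minimal usage sketch for the packed complex-half intrinsics defined above. Each __m128h is treated as four complex _Float16 values, with real parts in the even lanes and imaginary parts in the odd lanes; per the Intel intrinsics documentation, _mm_fcmadd_pch (a, b, c) accumulates a * conj(b) into c. The unaligned load/store helpers _mm_loadu_ph and _mm_storeu_ph are assumed to come from the wider AVX512FP16 header set, not from this hunk.

#include <immintrin.h>

/* acc[i] += a[i] * conj(b[i]) for interleaved complex _Float16 data,
   four complex values (eight halves) per iteration.
   Build with: gcc -O2 -mavx512fp16 -mavx512vl  */
void
cdotc_accum (_Float16 *acc, const _Float16 *a, const _Float16 *b, int ncomplex)
{
  for (int i = 0; i + 4 <= ncomplex; i += 4)
    {
      __m128h va = _mm_loadu_ph (a + 2 * i);
      __m128h vb = _mm_loadu_ph (b + 2 * i);
      __m128h vc = _mm_loadu_ph (acc + 2 * i);
      /* vfcmaddcph: complex multiply va by the conjugate of vb, add vc.  */
      vc = _mm_fcmadd_pch (va, vb, vc);
      _mm_storeu_ph (acc + 2 * i, vc);
    }
}
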
+extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fcmul_pch (__m256h __A, __m256h __B) +{ + return (__m256h) __builtin_ia32_vfcmulcph256 ((__v16hf) __A, (__v16hf) __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fcmul_pch (__m256h __A, __mmask8 __B, __m256h __C, __m256h __D) +{ + return (__m256h) __builtin_ia32_vfcmulcph256_mask ((__v16hf) __C, + (__v16hf) __D, + (__v16hf) __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fcmul_pch (__mmask8 __A, __m256h __B, __m256h __C) +{ + return (__m256h) __builtin_ia32_vfcmulcph256_mask ((__v16hf) __B, + (__v16hf) __C, + _mm256_setzero_ph (), + __A); +} + +#define _MM256_REDUCE_OP(op) \ + __m128h __T1 = (__m128h) _mm256_extractf128_pd ((__m256d) __A, 0); \ + __m128h __T2 = (__m128h) _mm256_extractf128_pd ((__m256d) __A, 1); \ + __m128h __T3 = (__T1 op __T2); \ + __m128h __T4 = (__m128h) __builtin_shuffle (__T3, \ + (__v8hi) { 4, 5, 6, 7, 0, 1, 2, 3 }); \ + __m128h __T5 = (__T3) op (__T4); \ + __m128h __T6 = (__m128h) __builtin_shuffle (__T5, \ + (__v8hi) { 2, 3, 0, 1, 4, 5, 6, 7 }); \ + __m128h __T7 = __T5 op __T6; \ + return __T7[0] op __T7[1] + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_reduce_add_ph (__m256h __A) +{ + _MM256_REDUCE_OP (+); +} + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_reduce_mul_ph (__m256h __A) +{ + _MM256_REDUCE_OP (*); +} + +#undef _MM256_REDUCE_OP +#define _MM256_REDUCE_OP(op) \ + __m128h __T1 = (__m128h) _mm256_extractf128_pd ((__m256d) __A, 0); \ + __m128h __T2 = (__m128h) _mm256_extractf128_pd ((__m256d) __A, 1); \ + __m128h __T3 = _mm_##op (__T1, __T2); \ + __m128h __T4 = (__m128h) __builtin_shuffle (__T3, \ + (__v8hi) { 2, 3, 0, 1, 6, 7, 4, 5 }); \ + __m128h __T5 = _mm_##op (__T3, __T4); \ + __m128h __T6 = (__m128h) __builtin_shuffle (__T5, (__v8hi) { 4, 5 }); \ + __m128h __T7 = _mm_##op (__T5, __T6); \ + __m128h __T8 = (__m128h) __builtin_shuffle (__T7, (__v8hi) { 1, 0 }); \ + __m128h __T9 = _mm_##op (__T7, __T8); \ + return __T9[0] + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_reduce_min_ph (__m256h __A) +{ + _MM256_REDUCE_OP (min_ph); +} + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_reduce_max_ph (__m256h __A) +{ + _MM256_REDUCE_OP (max_ph); +} + +#define _MM_REDUCE_OP(op) \ + __m128h __T1 = (__m128h) __builtin_shuffle (__A, \ + (__v8hi) { 4, 5, 6, 7, 0, 1, 2, 3 }); \ + __m128h __T2 = (__A) op (__T1); \ + __m128h __T3 = (__m128h) __builtin_shuffle (__T2, \ + (__v8hi){ 2, 3, 0, 1, 4, 5, 6, 7 }); \ + __m128h __T4 = __T2 op __T3; \ + return __T4[0] op __T4[1] + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_add_ph (__m128h __A) +{ + _MM_REDUCE_OP (+); +} + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_mul_ph (__m128h __A) +{ + _MM_REDUCE_OP (*); +} + +#undef _MM_REDUCE_OP +#define _MM_REDUCE_OP(op) \ + __m128h __T1 = (__m128h) __builtin_shuffle (__A, \ + (__v8hi) { 2, 3, 0, 1, 6, 7, 4, 5 }); \ + __m128h __T2 = _mm_##op (__A, __T1); \ + __m128h __T3 = (__m128h) __builtin_shuffle (__T2, (__v8hi){ 4, 5 }); \ + __m128h __T4 = _mm_##op (__T2, __T3); \ + __m128h __T5 = (__m128h) 
__builtin_shuffle (__T4, (__v8hi){ 1, 0 }); \ + __m128h __T6 = _mm_##op (__T4, __T5); \ + return __T6[0] + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_min_ph (__m128h __A) +{ + _MM_REDUCE_OP (min_ph); +} + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_max_ph (__m128h __A) +{ + _MM_REDUCE_OP (max_ph); +} + +#undef _MM256_REDUCE_OP +#undef _MM_REDUCE_OP + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_blend_ph (__mmask16 __U, __m256h __A, __m256h __W) +{ + return (__m256h) __builtin_ia32_movdquhi256_mask ((__v16hi) __W, + (__v16hi) __A, + (__mmask16) __U); + +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutex2var_ph (__m256h __A, __m256i __I, __m256h __B) +{ + return (__m256h) __builtin_ia32_vpermi2varhi256_mask ((__v16hi) __A, + (__v16hi) __I, + (__v16hi) __B, + (__mmask16)-1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutexvar_ph (__m256i __A, __m256h __B) +{ + return (__m256h) __builtin_ia32_permvarhi256_mask ((__v16hi) __B, + (__v16hi) __A, + (__v16hi) + (_mm256_setzero_ph ()), + (__mmask16)-1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_blend_ph (__mmask8 __U, __m128h __A, __m128h __W) +{ + return (__m128h) __builtin_ia32_movdquhi128_mask ((__v8hi) __W, + (__v8hi) __A, + (__mmask8) __U); + +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permutex2var_ph (__m128h __A, __m128i __I, __m128h __B) +{ + return (__m128h) __builtin_ia32_vpermi2varhi128_mask ((__v8hi) __A, + (__v8hi) __I, + (__v8hi) __B, + (__mmask8)-1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permutexvar_ph (__m128i __A, __m128h __B) +{ + return (__m128h) __builtin_ia32_permvarhi128_mask ((__v8hi) __B, + (__v8hi) __A, + (__v8hi) + (_mm_setzero_ph ()), + (__mmask8)-1); +} + #ifdef __DISABLE_AVX512FP16VL__ #undef __DISABLE_AVX512FP16VL__ #pragma GCC pop_options diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def index 5eae4d0..4c355c5 100644 --- a/gcc/config/i386/i386-builtin-types.def +++ b/gcc/config/i386/i386-builtin-types.def @@ -1348,6 +1348,7 @@ DEF_FUNCTION_TYPE (V8DI, V8HF, V8DI, UQI, INT) DEF_FUNCTION_TYPE (V8DF, V8HF, V8DF, UQI, INT) DEF_FUNCTION_TYPE (V8HF, V8DI, V8HF, UQI, INT) DEF_FUNCTION_TYPE (V8HF, V8DF, V8HF, UQI, INT) +DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, V8HF) DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, V8HF, UQI, INT) DEF_FUNCTION_TYPE (V8HF, V2DF, V8HF, V8HF, UQI, INT) DEF_FUNCTION_TYPE (V8HF, V4SF, V8HF, V8HF, UQI, INT) @@ -1358,12 +1359,14 @@ DEF_FUNCTION_TYPE (V16HF, V16HF, V16HF) DEF_FUNCTION_TYPE (V16HI, V16HF, V16HI, UHI) DEF_FUNCTION_TYPE (V16HF, V16HI, V16HF, UHI) DEF_FUNCTION_TYPE (V16HF, V16HF, V16HF, UHI) +DEF_FUNCTION_TYPE (V16HF, V16HF, V16HF, V16HF) DEF_FUNCTION_TYPE (V16SI, V16HF, V16SI, UHI, INT) DEF_FUNCTION_TYPE (V16SF, V16HF, V16SF, UHI, INT) DEF_FUNCTION_TYPE (V16HF, V16HF, INT, V16HF, UHI) DEF_FUNCTION_TYPE (UHI, V16HF, V16HF, INT, UHI) DEF_FUNCTION_TYPE (V16HF, V16SI, V16HF, UHI, INT) DEF_FUNCTION_TYPE (V16HF, V16SF, V16HF, UHI, INT) +DEF_FUNCTION_TYPE (V16HF, V16HF, V16HF, V16HF, UQI) DEF_FUNCTION_TYPE (V16HF, V16HF, V16HF, V16HF, UHI) DEF_FUNCTION_TYPE (V32HF, 
V32HF, V32HF, USI) DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, INT) @@ -1371,7 +1374,9 @@ DEF_FUNCTION_TYPE (V32HI, V32HF, V32HI, USI, INT) DEF_FUNCTION_TYPE (V32HF, V32HI, V32HF, USI, INT) DEF_FUNCTION_TYPE (USI, V32HF, V32HF, INT, USI) DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, USI, INT) +DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, INT) DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, USI) DEF_FUNCTION_TYPE (USI, V32HF, V32HF, INT, USI, INT) +DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, UHI, INT) DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, USI, INT) DEF_FUNCTION_TYPE (V32HF, V32HF, INT, V32HF, USI, INT) diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def index 5950d5e..302e1bc 100644 --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ -2911,6 +2911,26 @@ BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_ BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fnmsub_v8hf_mask, "__builtin_ia32_vfnmsubph128_mask", IX86_BUILTIN_VFNMSUBPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fnmsub_v8hf_mask3, "__builtin_ia32_vfnmsubph128_mask3", IX86_BUILTIN_VFNMSUBPH128_MASK3, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fnmsub_v8hf_maskz, "__builtin_ia32_vfnmsubph128_maskz", IX86_BUILTIN_VFNMSUBPH128_MASKZ, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_fma_fmaddc_v8hf, "__builtin_ia32_vfmaddcph128", IX86_BUILTIN_VFMADDCPH_V8HF, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fmaddc_v8hf_mask, "__builtin_ia32_vfmaddcph128_mask", IX86_BUILTIN_VFMADDCPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fmaddc_v8hf_maskz, "__builtin_ia32_vfmaddcph128_maskz", IX86_BUILTIN_VFMADDCPH128_MASKZ, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_fma_fmaddc_v16hf, "__builtin_ia32_vfmaddcph256", IX86_BUILTIN_VFMADDCPH_V16HF, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_fmaddc_v16hf_mask, "__builtin_ia32_vfmaddcph256_mask", IX86_BUILTIN_VFMADDCPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_fmaddc_v16hf_maskz, "__builtin_ia32_vfmaddcph256_maskz", IX86_BUILTIN_VFMADDCPH256_MASKZ, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_fma_fcmaddc_v8hf, "__builtin_ia32_vfcmaddcph128", IX86_BUILTIN_VFCMADDCPH_V8HF, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmaddc_v8hf_mask, "__builtin_ia32_vfcmaddcph128_mask", IX86_BUILTIN_VFCMADDCPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmaddc_v8hf_maskz, "__builtin_ia32_vfcmaddcph128_maskz", IX86_BUILTIN_VFCMADDCPH128_MASKZ, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_fma_fcmaddc_v16hf, 
"__builtin_ia32_vfcmaddcph256", IX86_BUILTIN_VFCMADDCPH_V16HF, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_fcmaddc_v16hf_mask, "__builtin_ia32_vfcmaddcph256_mask", IX86_BUILTIN_VFCMADDCPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_fcmaddc_v16hf_maskz, "__builtin_ia32_vfcmaddcph256_maskz", IX86_BUILTIN_VFCMADDCPH256_MASKZ, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmulc_v8hf, "__builtin_ia32_vfcmulcph128", IX86_BUILTIN_VFCMULCPH_V8HF, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmulc_v8hf_mask, "__builtin_ia32_vfcmulcph128_mask", IX86_BUILTIN_VFCMULCPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_fcmulc_v16hf, "__builtin_ia32_vfcmulcph256", IX86_BUILTIN_VFCMULCPH_V16HF, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_fcmulc_v16hf_mask, "__builtin_ia32_vfcmulcph256_mask", IX86_BUILTIN_VFCMULCPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fmulc_v8hf, "__builtin_ia32_vfmulcph128", IX86_BUILTIN_VFMULCPH_V8HF, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fmulc_v8hf_mask, "__builtin_ia32_vfmulcph128_mask", IX86_BUILTIN_VFMULCPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_fmulc_v16hf, "__builtin_ia32_vfmulcph256", IX86_BUILTIN_VFMULCPH_V16HF, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_fmulc_v16hf_mask, "__builtin_ia32_vfmulcph256_mask", IX86_BUILTIN_VFMULCPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UQI) /* Builtins with rounding support. 
*/ BDESC_END (ARGS, ROUND_ARGS) @@ -3201,6 +3221,26 @@ BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_vmfnmadd_v8hf_mask_round BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_vmfnmadd_v8hf_mask3_round, "__builtin_ia32_vfnmaddsh3_mask3", IX86_BUILTIN_VFNMADDSH3_MASK3, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_vmfnmadd_v8hf_maskz_round, "__builtin_ia32_vfnmaddsh3_maskz", IX86_BUILTIN_VFNMADDSH3_MASKZ, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_vmfmsub_v8hf_mask3_round, "__builtin_ia32_vfmsubsh3_mask3", IX86_BUILTIN_VFMSUBSH3_MASK3, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_fma_fmaddc_v32hf_round, "__builtin_ia32_vfmaddcph512_round", IX86_BUILTIN_VFMADDCPH512_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fmaddc_v32hf_mask_round, "__builtin_ia32_vfmaddcph512_mask_round", IX86_BUILTIN_VFMADDCPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fmaddc_v32hf_maskz_round, "__builtin_ia32_vfmaddcph512_maskz_round", IX86_BUILTIN_VFMADDCPH512_MASKZ_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_fma_fcmaddc_v32hf_round, "__builtin_ia32_vfcmaddcph512_round", IX86_BUILTIN_VFCMADDCPH512_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fcmaddc_v32hf_mask_round, "__builtin_ia32_vfcmaddcph512_mask_round", IX86_BUILTIN_VFCMADDCPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fcmaddc_v32hf_maskz_round, "__builtin_ia32_vfcmaddcph512_maskz_round", IX86_BUILTIN_VFCMADDCPH512_MASKZ_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fcmulc_v32hf_round, "__builtin_ia32_vfcmulcph512_round", IX86_BUILTIN_VFCMULCPH512_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fcmulc_v32hf_mask_round, "__builtin_ia32_vfcmulcph512_mask_round", IX86_BUILTIN_VFCMULCPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fmulc_v32hf_round, "__builtin_ia32_vfmulcph512_round", IX86_BUILTIN_VFMULCPH512_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fmulc_v32hf_mask_round, "__builtin_ia32_vfmulcph512_mask_round", IX86_BUILTIN_VFMULCPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fma_fcmaddcsh_v8hf_round, "__builtin_ia32_vfcmaddcsh_round", IX86_BUILTIN_VFCMADDCSH_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask_round, "__builtin_ia32_vfcmaddcsh_mask_round", IX86_BUILTIN_VFCMADDCSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmaddcsh_v8hf_maskz_round, "__builtin_ia32_vfcmaddcsh_maskz_round", IX86_BUILTIN_VFCMADDCSH_MASKZ_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fma_fmaddcsh_v8hf_round, 
"__builtin_ia32_vfmaddcsh_round", IX86_BUILTIN_VFMADDCSH_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask_round, "__builtin_ia32_vfmaddcsh_mask_round", IX86_BUILTIN_VFMADDCSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fmaddcsh_v8hf_maskz_round, "__builtin_ia32_vfmaddcsh_maskz_round", IX86_BUILTIN_VFMADDCSH_MASKZ_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmulcsh_v8hf_round, "__builtin_ia32_vfcmulcsh_round", IX86_BUILTIN_VFCMULCSH_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmulcsh_v8hf_mask_round, "__builtin_ia32_vfcmulcsh_mask_round", IX86_BUILTIN_VFCMULCSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fmulcsh_v8hf_round, "__builtin_ia32_vfmulcsh_round", IX86_BUILTIN_VFMULCSH_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fmulcsh_v8hf_mask_round, "__builtin_ia32_vfmulcsh_mask_round", IX86_BUILTIN_VFMULCSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) BDESC_END (ROUND_ARGS, MULTI_ARG) diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index c88cb14..4780b99 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -3638,6 +3638,8 @@ ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode, return false; else if (vector_size == 64) return true; + else if (GET_MODE_INNER (cmp_mode) == HFmode) + return true; /* When op_true is NULL, op_false must be NULL, or vice versa. 
*/ gcc_assert (!op_true == !op_false); @@ -9762,6 +9764,7 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V2DI_FTYPE_V8HF_V2DI_UQI: case V2DI_FTYPE_V4SF_V2DI_UQI: case V8HF_FTYPE_V8HF_V8HF_UQI: + case V8HF_FTYPE_V8HF_V8HF_V8HF: case V8HF_FTYPE_V8HI_V8HF_UQI: case V8HF_FTYPE_V8SI_V8HF_UQI: case V8HF_FTYPE_V8SF_V8HF_UQI: @@ -9840,6 +9843,7 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V16SF_FTYPE_V8SF_V16SF_UHI: case V16SI_FTYPE_V8SI_V16SI_UHI: case V16HF_FTYPE_V16HI_V16HF_UHI: + case V16HF_FTYPE_V16HF_V16HF_V16HF: case V16HI_FTYPE_V16HF_V16HI_UHI: case V16HI_FTYPE_V16HI_V16HI_UHI: case V8HI_FTYPE_V16QI_V8HI_UQI: @@ -9996,6 +10000,7 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI: case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI: case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI: + case V16HF_FTYPE_V16HF_V16HF_V16HF_UQI: case V16HF_FTYPE_V16HF_V16HF_V16HF_UHI: case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI: case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI: @@ -10725,6 +10730,7 @@ ix86_expand_round_builtin (const struct builtin_description *d, case V16SF_FTYPE_V16HF_V16SF_UHI_INT: case V32HF_FTYPE_V32HI_V32HF_USI_INT: case V32HF_FTYPE_V32HF_V32HF_USI_INT: + case V32HF_FTYPE_V32HF_V32HF_V32HF_INT: case V16SF_FTYPE_V16SF_V16SF_HI_INT: case V8DI_FTYPE_V8SF_V8DI_QI_INT: case V16SF_FTYPE_V16SI_V16SF_HI_INT: @@ -10754,6 +10760,7 @@ ix86_expand_round_builtin (const struct builtin_description *d, case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT: case V4SF_FTYPE_V8HF_V4SF_V4SF_UQI_INT: case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT: + case V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT: case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT: case V2DF_FTYPE_V8HF_V2DF_V2DF_UQI_INT: case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT: @@ -16038,6 +16045,7 @@ emit_reduc_half (rtx dest, rtx src, int i) break; case E_V16QImode: case E_V8HImode: + case E_V8HFmode: case E_V4SImode: case E_V2DImode: d = gen_reg_rtx (V1TImode); @@ -16059,6 +16067,7 @@ emit_reduc_half (rtx dest, rtx src, int i) break; case E_V32QImode: case E_V16HImode: + case E_V16HFmode: case E_V8SImode: case E_V4DImode: if (i == 256) @@ -16078,6 +16087,7 @@ emit_reduc_half (rtx dest, rtx src, int i) break; case E_V64QImode: case E_V32HImode: + case E_V32HFmode: if (i < 64) { d = gen_reg_rtx (V4TImode); diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c index 14f816f..43bb676 100644 --- a/gcc/config/i386/i386-features.c +++ b/gcc/config/i386/i386-features.c @@ -2258,15 +2258,22 @@ remove_partial_avx_dependency (void) rtx zero; machine_mode dest_vecmode; - if (dest_mode == E_SFmode) + switch (dest_mode) { + case E_HFmode: + dest_vecmode = V8HFmode; + zero = gen_rtx_SUBREG (V8HFmode, v4sf_const0, 0); + break; + case E_SFmode: dest_vecmode = V4SFmode; zero = v4sf_const0; - } - else - { + break; + case E_DFmode: dest_vecmode = V2DFmode; zero = gen_rtx_SUBREG (V2DFmode, v4sf_const0, 0); + break; + default: + gcc_unreachable (); } /* Change source to vector mode. 
*/ diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index ba89e11..a566d84 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -2462,6 +2462,8 @@ classify_argument (machine_mode mode, const_tree type, case E_V2SFmode: case E_V2SImode: case E_V4HImode: + case E_V4HFmode: + case E_V2HFmode: case E_V8QImode: classes[0] = X86_64_SSE_CLASS; return 1; @@ -2902,6 +2904,7 @@ pass_in_reg: case E_V8QImode: case E_V4HImode: + case E_V4HFmode: case E_V2SImode: case E_V2SFmode: case E_V1TImode: @@ -3149,6 +3152,7 @@ pass_in_reg: case E_V8QImode: case E_V4HImode: + case E_V4HFmode: case E_V2SImode: case E_V2SFmode: case E_V1TImode: @@ -5035,7 +5039,8 @@ standard_80387_constant_p (rtx x) /* For XFmode constants, try to find a special 80387 instruction when optimizing for size or on those CPUs that benefit from them. */ if (mode == XFmode - && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS)) + && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS) + && !flag_rounding_math) { int i; @@ -10703,24 +10708,19 @@ legitimate_pic_address_disp_p (rtx disp) if (is_imported_p (op0)) return true; - if (SYMBOL_REF_FAR_ADDR_P (op0) - || !SYMBOL_REF_LOCAL_P (op0)) + if (SYMBOL_REF_FAR_ADDR_P (op0) || !SYMBOL_REF_LOCAL_P (op0)) break; - /* Function-symbols need to be resolved only for - large-model. - For the small-model we don't need to resolve anything - here. */ + /* Non-external-weak function symbols need to be resolved only + for the large model. Non-external symbols don't need to be + resolved for large and medium models. For the small model, + we don't need to resolve anything here. */ if ((ix86_cmodel != CM_LARGE_PIC - && SYMBOL_REF_FUNCTION_P (op0)) + && SYMBOL_REF_FUNCTION_P (op0) + && !(SYMBOL_REF_EXTERNAL_P (op0) && SYMBOL_REF_WEAK (op0))) + || !SYMBOL_REF_EXTERNAL_P (op0) || ix86_cmodel == CM_SMALL_PIC) return true; - /* Non-external symbols don't need to be resolved for - large, and medium-model. */ - if ((ix86_cmodel == CM_LARGE_PIC - || ix86_cmodel == CM_MEDIUM_PIC) - && !SYMBOL_REF_EXTERNAL_P (op0)) - return true; } else if (!SYMBOL_REF_FAR_ADDR_P (op0) && (SYMBOL_REF_LOCAL_P (op0) diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 8a4251b..cba6d83 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -1033,7 +1033,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); || (MODE) == TImode) #define VALID_AVX512FP16_REG_MODE(MODE) \ - ((MODE) == V8HFmode || (MODE) == V16HFmode || (MODE) == V32HFmode) + ((MODE) == V8HFmode || (MODE) == V16HFmode || (MODE) == V32HFmode \ + || (MODE) == V2HFmode) #define VALID_SSE2_REG_MODE(MODE) \ ((MODE) == V16QImode || (MODE) == V8HImode || (MODE) == V2DFmode \ @@ -1041,7 +1042,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); || (MODE) == V2DImode || (MODE) == DFmode || (MODE) == HFmode) #define VALID_SSE2_REG_VHF_MODE(MODE) \ - (VALID_SSE2_REG_MODE (MODE) || (MODE) == V8HFmode) + (VALID_SSE2_REG_MODE (MODE) || (MODE) == V8HFmode \ + || (MODE) == V4HFmode || (MODE) == V2HFmode) #define VALID_SSE_REG_MODE(MODE) \ ((MODE) == V1TImode || (MODE) == TImode \ @@ -1051,10 +1053,12 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); #define VALID_MMX_REG_MODE_3DNOW(MODE) \ ((MODE) == V2SFmode || (MODE) == SFmode) +/* To match ia32 psABI, V4HFmode should be added here. 
*/ #define VALID_MMX_REG_MODE(MODE) \ ((MODE) == V1DImode || (MODE) == DImode \ || (MODE) == V2SImode || (MODE) == SImode \ - || (MODE) == V4HImode || (MODE) == V8QImode) + || (MODE) == V4HImode || (MODE) == V8QImode \ + || (MODE) == V4HFmode) #define VALID_MASK_REG_MODE(MODE) ((MODE) == HImode || (MODE) == QImode) @@ -1087,7 +1091,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); || (MODE) == V4DImode || (MODE) == V8SFmode || (MODE) == V4DFmode \ || (MODE) == V2TImode || (MODE) == V8DImode || (MODE) == V64QImode \ || (MODE) == V16SImode || (MODE) == V32HImode || (MODE) == V8DFmode \ - || (MODE) == V16SFmode || VALID_AVX512FP16_REG_MODE (MODE)) + || (MODE) == V16SFmode || (MODE) == V32HFmode || (MODE) == V16HFmode \ + || (MODE) == V8HFmode) #define X87_FLOAT_MODE_P(MODE) \ (TARGET_80387 && ((MODE) == SFmode || (MODE) == DFmode || (MODE) == XFmode)) diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 188f431..04cb3bf 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -498,7 +498,7 @@ ;; Main data type used by the insn (define_attr "mode" "unknown,none,QI,HI,SI,DI,TI,OI,XI,HF,SF,DF,XF,TF,V32HF,V16HF,V8HF, - V16SF,V8SF,V4DF,V4SF,V2DF,V2SF,V1DF,V8DF" + V16SF,V8SF,V4DF,V4SF,V2DF,V2SF,V1DF,V8DF,V4HF,V2HF" (const_string "unknown")) ;; The CPU unit operations uses. @@ -832,7 +832,7 @@ x64_avx,x64_avx512bw,x64_avx512dq, sse_noavx,sse2,sse2_noavx,sse3,sse3_noavx,sse4,sse4_noavx, avx,noavx,avx2,noavx2,bmi,bmi2,fma4,fma,avx512f,noavx512f, - avx512bw,noavx512bw,avx512dq,noavx512dq, + avx512bw,noavx512bw,avx512dq,noavx512dq,fma_or_avx512vl, avx512vl,noavx512vl,avxvnni,avx512vnnivl,avx512fp16" (const_string "base")) @@ -874,6 +874,8 @@ (eq_attr "isa" "bmi2") (symbol_ref "TARGET_BMI2") (eq_attr "isa" "fma4") (symbol_ref "TARGET_FMA4") (eq_attr "isa" "fma") (symbol_ref "TARGET_FMA") + (eq_attr "isa" "fma_or_avx512vl") + (symbol_ref "TARGET_FMA || TARGET_AVX512VL") (eq_attr "isa" "avx512f") (symbol_ref "TARGET_AVX512F") (eq_attr "isa" "noavx512f") (symbol_ref "!TARGET_AVX512F") (eq_attr "isa" "avx512bw") (symbol_ref "TARGET_AVX512BW") @@ -1104,7 +1106,8 @@ (V1TI "16") (V2TI "32") (V4TI "64") (V2DF "16") (V4DF "32") (V8DF "64") (V4SF "16") (V8SF "32") (V16SF "64") - (V8HF "16") (V16HF "32") (V32HF "64")]) + (V8HF "16") (V16HF "32") (V32HF "64") + (V4HF "8") (V2HF "4")]) ;; Double word integer modes as mode attribute. (define_mode_attr DWI [(QI "HI") (HI "SI") (SI "DI") (DI "TI") (TI "OI")]) @@ -1541,6 +1544,21 @@ DONE; }) +(define_expand "cstorehf4" + [(set (reg:CC FLAGS_REG) + (compare:CC (match_operand:HF 2 "cmp_fp_expander_operand") + (match_operand:HF 3 "cmp_fp_expander_operand"))) + (set (match_operand:QI 0 "register_operand") + (match_operator 1 "ix86_fp_comparison_operator" + [(reg:CC FLAGS_REG) + (const_int 0)]))] + "TARGET_AVX512FP16" +{ + ix86_expand_setcc (operands[0], GET_CODE (operands[1]), + operands[2], operands[3]); + DONE; +}) + (define_expand "cstore<mode>4" [(set (reg:CC FLAGS_REG) (compare:CC (match_operand:MODEF 2 "cmp_fp_expander_operand") @@ -4793,6 +4811,16 @@ } }) +(define_insn "fix<fixunssuffix>_trunchf<mode>2" + [(set (match_operand:SWI48 0 "register_operand" "=r") + (any_fix:SWI48 + (match_operand:HF 1 "nonimmediate_operand" "vm")))] + "TARGET_AVX512FP16" + "vcvttsh2<fixsuffix>si\t{%1, %0|%0, %1}" + [(set_attr "type" "sseicvt") + (set_attr "prefix" "evex") + (set_attr "mode" "<MODE>")]) + ;; Signed conversion to SImode. 
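
The fix<fixunssuffix>_trunchf<mode>2 pattern added above lets plain C conversions from _Float16 to integer types map onto vcvttsh2si/vcvttsh2usi. A small illustration, assuming the scalar forms only require -mavx512fp16:

/* With -O2 -mavx512fp16, each cast below can be emitted as a single
   vcvttsh2si / vcvttsh2usi truncating conversion (illustrative; the exact
   code generated depends on the surrounding function).  */
int
hf_to_int (_Float16 x)
{
  return (int) x;
}

unsigned long long
hf_to_u64 (_Float16 x)
{
  return (unsigned long long) x;  /* 64-bit form needs a 64-bit target.  */
}
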
(define_expand "fix_truncxfsi2" @@ -4900,6 +4928,17 @@ (set_attr "prefix" "evex") (set_attr "mode" "SI")]) +(define_insn "*fixuns_trunchfsi2zext" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI + (unsigned_fix:SI + (match_operand:HF 1 "nonimmediate_operand" "vm"))))] + "TARGET_64BIT && TARGET_AVX512FP16" + "vcvttsh2usi\t{%1, %k0|%k0, %1}" + [(set_attr "type" "sseicvt") + (set_attr "prefix" "evex") + (set_attr "mode" "SI")]) + (define_insn "*fixuns_trunc<mode>si2_avx512f_zext" [(set (match_operand:DI 0 "register_operand" "=r") (zero_extend:DI @@ -4932,6 +4971,14 @@ ;; Without these patterns, we'll try the unsigned SI conversion which ;; is complex for SSE, rather than the signed SI conversion, which isn't. +(define_expand "fixuns_trunchfhi2" + [(set (match_dup 2) + (fix:SI (match_operand:HF 1 "nonimmediate_operand"))) + (set (match_operand:HI 0 "nonimmediate_operand") + (subreg:HI (match_dup 2) 0))] + "TARGET_AVX512FP16" + "operands[2] = gen_reg_rtx (SImode);") + (define_expand "fixuns_trunc<mode>hi2" [(set (match_dup 2) (fix:SI (match_operand:MODEF 1 "nonimmediate_operand"))) @@ -10163,6 +10210,40 @@ [(set_attr "type" "alu") (set_attr "mode" "<MODE>")]) +;; convert (sign_extend:WIDE (any_logic:NARROW (memory, immediate))) +;; to (any_logic:WIDE (sign_extend (memory)), (sign_extend (immediate))). +;; This eliminates sign extension after logic operation. + +(define_split + [(set (match_operand:SWI248 0 "register_operand") + (sign_extend:SWI248 + (any_logic:QI (match_operand:QI 1 "memory_operand") + (match_operand:QI 2 "const_int_operand"))))] + "" + [(set (match_dup 3) (sign_extend:SWI248 (match_dup 1))) + (set (match_dup 0) (any_logic:SWI248 (match_dup 3) (match_dup 2)))] + "operands[3] = gen_reg_rtx (<MODE>mode);") + +(define_split + [(set (match_operand:SWI48 0 "register_operand") + (sign_extend:SWI48 + (any_logic:HI (match_operand:HI 1 "memory_operand") + (match_operand:HI 2 "const_int_operand"))))] + "" + [(set (match_dup 3) (sign_extend:SWI48 (match_dup 1))) + (set (match_dup 0) (any_logic:SWI48 (match_dup 3) (match_dup 2)))] + "operands[3] = gen_reg_rtx (<MODE>mode);") + +(define_split + [(set (match_operand:DI 0 "register_operand") + (sign_extend:DI + (any_logic:SI (match_operand:SI 1 "memory_operand") + (match_operand:SI 2 "const_int_operand"))))] + "TARGET_64BIT" + [(set (match_dup 3) (sign_extend:DI (match_dup 1))) + (set (match_dup 0) (any_logic:DI (match_dup 3) (match_dup 2)))] + "operands[3] = gen_reg_rtx (DImode);") + (define_insn "*<code><mode>_2" [(set (reg FLAGS_REG) (compare (any_or:SWI @@ -17041,6 +17122,19 @@ DONE; }) +(define_insn "sqrthf2" + [(set (match_operand:HF 0 "register_operand" "=v,v") + (sqrt:HF + (match_operand:HF 1 "nonimmediate_operand" "v,m")))] + "TARGET_AVX512FP16" + "@ + vsqrtsh\t{%d1, %0|%0, %d1} + vsqrtsh\t{%1, %d0|%d0, %1}" + [(set_attr "type" "sse") + (set_attr "prefix" "evex") + (set_attr "avx_partial_xmm_update" "false,true") + (set_attr "mode" "HF")]) + (define_insn "*sqrt<mode>2_sse" [(set (match_operand:MODEF 0 "register_operand" "=v,v,v") (sqrt:MODEF @@ -18220,9 +18314,9 @@ (define_insn "sse4_1_round<mode>2" - [(set (match_operand:MODEF 0 "register_operand" "=x,x,x,v,v") - (unspec:MODEF - [(match_operand:MODEF 1 "nonimmediate_operand" "0,x,m,v,m") + [(set (match_operand:MODEFH 0 "register_operand" "=x,x,x,v,v") + (unspec:MODEFH + [(match_operand:MODEFH 1 "nonimmediate_operand" "0,x,m,v,m") (match_operand:SI 2 "const_0_to_15_operand" "n,n,n,n,n")] UNSPEC_ROUND))] "TARGET_SSE4_1" @@ -18257,6 +18351,17 @@ (set_attr 
"znver1_decode" "vector") (set_attr "mode" "XF")]) +(define_expand "rinthf2" + [(match_operand:HF 0 "register_operand") + (match_operand:HF 1 "nonimmediate_operand")] + "TARGET_AVX512FP16" +{ + emit_insn (gen_sse4_1_roundhf2 (operands[0], + operands[1], + GEN_INT (ROUND_MXCSR))); + DONE; +}) + (define_expand "rint<mode>2" [(use (match_operand:MODEF 0 "register_operand")) (use (match_operand:MODEF 1 "nonimmediate_operand"))] @@ -18290,6 +18395,17 @@ "TARGET_USE_FANCY_MATH_387 && !flag_trapping_math") +(define_expand "nearbyinthf2" + [(match_operand:HF 0 "register_operand") + (match_operand:HF 1 "nonimmediate_operand")] + "TARGET_AVX512FP16" +{ + emit_insn (gen_sse4_1_roundhf2 (operands[0], + operands[1], + GEN_INT (ROUND_MXCSR | ROUND_NO_EXC))); + DONE; +}) + (define_expand "nearbyint<mode>2" [(use (match_operand:MODEF 0 "register_operand")) (use (match_operand:MODEF 1 "nonimmediate_operand"))] @@ -18479,6 +18595,18 @@ "TARGET_USE_FANCY_MATH_387 && (flag_fp_int_builtin_inexact || !flag_trapping_math)") +(define_expand "<rounding_insn>hf2" + [(parallel [(set (match_operand:HF 0 "register_operand") + (unspec:HF [(match_operand:HF 1 "register_operand")] + FRNDINT_ROUNDING)) + (clobber (reg:CC FLAGS_REG))])] + "TARGET_AVX512FP16" +{ + emit_insn (gen_sse4_1_roundhf2 (operands[0], operands[1], + GEN_INT (ROUND_<ROUNDING> | ROUND_NO_EXC))); + DONE; +}) + (define_expand "<rounding_insn><mode>2" [(parallel [(set (match_operand:MODEF 0 "register_operand") (unspec:MODEF [(match_operand:MODEF 1 "register_operand")] @@ -19882,6 +20010,17 @@ (set_attr "type" "sseadd") (set_attr "mode" "<MODE>")]) +(define_insn "<code>hf3" + [(set (match_operand:HF 0 "register_operand" "=v") + (smaxmin:HF + (match_operand:HF 1 "nonimmediate_operand" "%v") + (match_operand:HF 2 "nonimmediate_operand" "vm")))] + "TARGET_AVX512FP16" + "v<maxmin_float>sh\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "prefix" "evex") + (set_attr "type" "sseadd") + (set_attr "mode" "HF")]) + ;; These versions of the min/max patterns implement exactly the operations ;; min = (op1 < op2 ? op1 : op2) ;; max = (!(op1 < op2) ? 
op1 : op2) diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 2d3b63f..c9467bc 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -48,7 +48,7 @@ (define_mode_iterator MMXMODEI8 [V8QI V4HI V2SI (V1DI "TARGET_SSE2")]) ;; All 8-byte vector modes handled by MMX -(define_mode_iterator MMXMODE [V8QI V4HI V2SI V1DI V2SF]) +(define_mode_iterator MMXMODE [V8QI V4HI V2SI V1DI V2SF V4HF]) (define_mode_iterator MMXMODE124 [V8QI V4HI V2SI V2SF]) ;; Mix-n-match @@ -57,8 +57,8 @@ (define_mode_iterator MMXMODE24 [V4HI V2SI]) (define_mode_iterator MMXMODE248 [V4HI V2SI V1DI]) -;; All 4-byte integer vector modes -(define_mode_iterator V_32 [V4QI V2HI V1SI]) +;; All 4-byte integer/float16 vector modes +(define_mode_iterator V_32 [V4QI V2HI V1SI V2HF]) ;; 4-byte integer vector modes (define_mode_iterator VI_32 [V4QI V2HI]) @@ -66,6 +66,9 @@ ;; V2S* modes (define_mode_iterator V2FI [V2SF V2SI]) +;; 4-byte and 8-byte float16 vector modes +(define_mode_iterator VHF_32_64 [V4HF V2HF]) + ;; Mapping from integer vector mode to mnemonic suffix (define_mode_attr mmxvecsize [(V8QI "b") (V4QI "b") (V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")]) @@ -191,6 +194,8 @@ (eq_attr "alternative" "11,12") (cond [(match_test "<MODE>mode == V2SFmode") (const_string "V4SF") + (match_test "<MODE>mode == V4HFmode") + (const_string "V4SF") (ior (not (match_test "TARGET_SSE2")) (match_test "optimize_function_for_size_p (cfun)")) (const_string "V4SF") @@ -198,14 +203,16 @@ (const_string "TI")) (and (eq_attr "alternative" "13") - (ior (and (match_test "<MODE>mode == V2SFmode") - (not (match_test "TARGET_MMX_WITH_SSE"))) - (not (match_test "TARGET_SSE2")))) + (ior (ior (and (match_test "<MODE>mode == V2SFmode") + (not (match_test "TARGET_MMX_WITH_SSE"))) + (not (match_test "TARGET_SSE2"))) + (match_test "<MODE>mode == V4HFmode"))) (const_string "V2SF") (and (eq_attr "alternative" "14") - (ior (match_test "<MODE>mode == V2SFmode") - (not (match_test "TARGET_SSE2")))) + (ior (ior (match_test "<MODE>mode == V2SFmode") + (not (match_test "TARGET_SSE2"))) + (match_test "<MODE>mode == V4HFmode"))) (const_string "V2SF") ] (const_string "DI"))) @@ -289,12 +296,17 @@ (const_string "*"))) (set (attr "mode") (cond [(eq_attr "alternative" "2,3") - (cond [(match_test "TARGET_AVX") + (cond [(match_test "<MODE>mode == V2HFmode") + (const_string "V4SF") + (match_test "TARGET_AVX") (const_string "TI") (match_test "optimize_function_for_size_p (cfun)") (const_string "V4SF") ] (const_string "TI")) + (and (eq_attr "alternative" "4,5") + (match_test "<MODE>mode == V2HFmode")) + (const_string "SF") ] (const_string "SI"))) (set (attr "preferred_for_speed") @@ -1019,12 +1031,13 @@ (match_operand:V2SF 1 "register_operand" "%0,v,x") (match_operand:V2SF 2 "register_operand" "v,v,x") (match_operand:V2SF 3 "register_operand" "v,0,x")))] - "(TARGET_FMA || TARGET_FMA4) && TARGET_MMX_WITH_SSE" + "(TARGET_FMA || TARGET_FMA4 || TARGET_AVX512VL) + && TARGET_MMX_WITH_SSE" "@ vfmadd132ps\t{%2, %3, %0|%0, %3, %2} vfmadd231ps\t{%2, %1, %0|%0, %1, %2} vfmaddps\t{%3, %2, %1, %0|%0, %1, %2, %3}" - [(set_attr "isa" "fma,fma,fma4") + [(set_attr "isa" "fma_or_avx512vl,fma_or_avx512vl,fma4") (set_attr "type" "ssemuladd") (set_attr "mode" "V4SF")]) @@ -1035,12 +1048,13 @@ (match_operand:V2SF 2 "register_operand" "v,v,x") (neg:V2SF (match_operand:V2SF 3 "register_operand" "v,0,x"))))] - "(TARGET_FMA || TARGET_FMA4) && TARGET_MMX_WITH_SSE" + "(TARGET_FMA || TARGET_FMA4 || TARGET_AVX512VL) + && TARGET_MMX_WITH_SSE" "@ vfmsub132ps\t{%2, %3, %0|%0, %3, %2} 
vfmsub231ps\t{%2, %1, %0|%0, %1, %2} vfmsubps\t{%3, %2, %1, %0|%0, %1, %2, %3}" - [(set_attr "isa" "fma,fma,fma4") + [(set_attr "isa" "fma_or_avx512vl,fma_or_avx512vl,fma4") (set_attr "type" "ssemuladd") (set_attr "mode" "V4SF")]) @@ -1051,12 +1065,13 @@ (match_operand:V2SF 1 "register_operand" "%0,v,x")) (match_operand:V2SF 2 "register_operand" "v,v,x") (match_operand:V2SF 3 "register_operand" "v,0,x")))] - "(TARGET_FMA || TARGET_FMA4) && TARGET_MMX_WITH_SSE" + "(TARGET_FMA || TARGET_FMA4 || TARGET_AVX512VL) + && TARGET_MMX_WITH_SSE" "@ vfnmadd132ps\t{%2, %3, %0|%0, %3, %2} vfnmadd231ps\t{%2, %1, %0|%0, %1, %2} vfnmaddps\t{%3, %2, %1, %0|%0, %1, %2, %3}" - [(set_attr "isa" "fma,fma,fma4") + [(set_attr "isa" "fma_or_avx512vl,fma_or_avx512vl,fma4") (set_attr "type" "ssemuladd") (set_attr "mode" "V4SF")]) @@ -1068,12 +1083,13 @@ (match_operand:V2SF 2 "register_operand" "v,v,x") (neg:V2SF (match_operand:V2SF 3 "register_operand" "v,0,x"))))] - "(TARGET_FMA || TARGET_FMA4) && TARGET_MMX_WITH_SSE" + "(TARGET_FMA || TARGET_FMA4 || TARGET_AVX512VL) + && TARGET_MMX_WITH_SSE" "@ vfnmsub132ps\t{%2, %3, %0|%0, %3, %2} vfnmsub231ps\t{%2, %1, %0|%0, %1, %2} vfnmsubps\t{%3, %2, %1, %0|%0, %1, %2, %3}" - [(set_attr "isa" "fma,fma,fma4") + [(set_attr "isa" "fma_or_avx512vl,fma_or_avx512vl,fma4") (set_attr "type" "ssemuladd") (set_attr "mode" "V4SF")]) @@ -1389,6 +1405,28 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; +;; Parallel half-precision floating point arithmetic +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_insn "<insn><mode>3" + [(set (match_operand:VHF_32_64 0 "register_operand" "=v") + (plusminusmultdiv:VHF_32_64 + (match_operand:VHF_32_64 1 "register_operand" "<comm>v") + (match_operand:VHF_32_64 2 "register_operand" "v")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "v<insn>ph\t{%2, %1, %0|%0, %1, %2}" + [(set (attr "type") + (cond [(match_test "<CODE> == MULT") + (const_string "ssemul") + (match_test "<CODE> == DIV") + (const_string "ssediv")] + (const_string "sseadd"))) + (set_attr "prefix" "evex") + (set_attr "mode" "V8HF")]) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; ;; Parallel integral arithmetic ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 0016c02..4559b0c 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -191,6 +191,14 @@ UNSPEC_VCVTNE2PS2BF16 UNSPEC_VCVTNEPS2BF16 UNSPEC_VDPBF16PS + + ;; For AVX512FP16 suppport + UNSPEC_COMPLEX_FMA + UNSPEC_COMPLEX_FCMA + UNSPEC_COMPLEX_FMUL + UNSPEC_COMPLEX_FCMUL + UNSPEC_COMPLEX_MASK + ]) (define_c_enum "unspecv" [ @@ -939,6 +947,10 @@ (V16SF "HI") (V8SF "QI") (V4SF "QI") (V8DF "QI") (V4DF "QI") (V2DF "QI")]) +;; Mapping of vector modes to corresponding complex mask size +(define_mode_attr avx512fmaskcmode + [(V32HF "HI") (V16HF "QI") (V8HF "QI")]) + ;; Mapping of vector modes to corresponding mask size (define_mode_attr avx512fmaskmodelower [(V64QI "di") (V32QI "si") (V16QI "hi") @@ -977,9 +989,9 @@ (V16HF "OI") (V8HF "TI")]) (define_mode_attr sseintvecmodelower - [(V16SF "v16si") (V8DF "v8di") - (V8SF "v8si") (V4DF "v4di") - (V4SF "v4si") (V2DF "v2di") + [(V32HF "v32hi") (V16SF "v16si") (V8DF "v8di") + (V16HF "v16hi") (V8SF "v8si") (V4DF "v4di") + (V8HF "v8hi") (V4SF "v4si") (V2DF "v2di") (V8SI "v8si") (V4DI "v4di") (V4SI "v4si") (V2DI "v2di") (V16HI "v16hi") (V8HI "v8hi") @@ -1022,6 +1034,13 @@ (V8DI "V8HF") (V4DI "V8HF") (V2DI 
"V8HF") (V8DF "V8HF") (V16SF "V16HF") (V8SF "V8HF")]) +;; Mapping of vector modes to vector hf modes of same element. +(define_mode_attr ssePHmodelower + [(V32HI "v32hf") (V16HI "v16hf") (V8HI "v8hf") + (V16SI "v16hf") (V8SI "v8hf") (V4SI "v4hf") + (V8DI "v8hf") (V4DI "v4hf") (V2DI "v2hf") + (V8DF "v8hf") (V16SF "v16hf") (V8SF "v8hf")]) + ;; Mapping of vector modes to packed single mode of the same size (define_mode_attr ssePSmode [(V16SI "V16SF") (V8DF "V16SF") @@ -1549,9 +1568,9 @@ (set_attr "mode" "<sseinsnmode>")]) (define_insn "<avx512>_store<mode>_mask" - [(set (match_operand:VI12_AVX512VL 0 "memory_operand" "=m") - (vec_merge:VI12_AVX512VL - (match_operand:VI12_AVX512VL 1 "register_operand" "v") + [(set (match_operand:VI12HF_AVX512VL 0 "memory_operand" "=m") + (vec_merge:VI12HF_AVX512VL + (match_operand:VI12HF_AVX512VL 1 "register_operand" "v") (match_dup 0) (match_operand:<avx512fmaskmode> 2 "register_operand" "Yk")))] "TARGET_AVX512BW" @@ -2106,12 +2125,12 @@ [(set_attr "isa" "noavx,noavx,avx,avx")]) (define_expand "cond_<insn><mode>" - [(set (match_operand:VF 0 "register_operand") - (vec_merge:VF - (plusminus:VF - (match_operand:VF 2 "vector_operand") - (match_operand:VF 3 "vector_operand")) - (match_operand:VF 4 "nonimm_or_0_operand") + [(set (match_operand:VFH 0 "register_operand") + (vec_merge:VFH + (plusminus:VFH + (match_operand:VFH 2 "vector_operand") + (match_operand:VFH 3 "vector_operand")) + (match_operand:VFH 4 "nonimm_or_0_operand") (match_operand:<avx512fmaskmode> 1 "register_operand")))] "<MODE_SIZE> == 64 || TARGET_AVX512VL" { @@ -2195,12 +2214,12 @@ (set_attr "mode" "<ssescalarmode>")]) (define_expand "cond_mul<mode>" - [(set (match_operand:VF 0 "register_operand") - (vec_merge:VF - (mult:VF - (match_operand:VF 2 "vector_operand") - (match_operand:VF 3 "vector_operand")) - (match_operand:VF 4 "nonimm_or_0_operand") + [(set (match_operand:VFH 0 "register_operand") + (vec_merge:VFH + (mult:VFH + (match_operand:VFH 2 "vector_operand") + (match_operand:VFH 3 "vector_operand")) + (match_operand:VFH 4 "nonimm_or_0_operand") (match_operand:<avx512fmaskmode> 1 "register_operand")))] "<MODE_SIZE> == 64 || TARGET_AVX512VL" { @@ -2310,12 +2329,12 @@ }) (define_expand "cond_div<mode>" - [(set (match_operand:VF 0 "register_operand") - (vec_merge:VF - (div:VF - (match_operand:VF 2 "register_operand") - (match_operand:VF 3 "vector_operand")) - (match_operand:VF 4 "nonimm_or_0_operand") + [(set (match_operand:VFH 0 "register_operand") + (vec_merge:VFH + (div:VFH + (match_operand:VFH 2 "register_operand") + (match_operand:VFH 3 "vector_operand")) + (match_operand:VFH 4 "nonimm_or_0_operand") (match_operand:<avx512fmaskmode> 1 "register_operand")))] "<MODE_SIZE> == 64 || TARGET_AVX512VL" { @@ -2510,12 +2529,12 @@ (set_attr "mode" "<ssescalarmode>")]) (define_insn "*<sse>_vmsqrt<mode>2<mask_scalar_name><round_scalar_name>" - [(set (match_operand:VF_128 0 "register_operand" "=x,v") - (vec_merge:VF_128 - (vec_duplicate:VF_128 + [(set (match_operand:VFH_128 0 "register_operand" "=x,v") + (vec_merge:VFH_128 + (vec_duplicate:VFH_128 (sqrt:<ssescalarmode> (match_operand:<ssescalarmode> 1 "nonimmediate_operand" "xm,<round_scalar_constraint>"))) - (match_operand:VF_128 2 "register_operand" "0,v") + (match_operand:VFH_128 2 "register_operand" "0,v") (const_int 1)))] "TARGET_SSE" "@ @@ -2648,12 +2667,12 @@ (set_attr "mode" "HF")]) (define_expand "cond_<code><mode>" - [(set (match_operand:VF 0 "register_operand") - (vec_merge:VF - (smaxmin:VF - (match_operand:VF 2 "vector_operand") - 
(match_operand:VF 3 "vector_operand")) - (match_operand:VF 4 "nonimm_or_0_operand") + [(set (match_operand:VFH 0 "register_operand") + (vec_merge:VFH + (smaxmin:VFH + (match_operand:VFH 2 "vector_operand") + (match_operand:VFH 3 "vector_operand")) + (match_operand:VFH 4 "nonimm_or_0_operand") (match_operand:<avx512fmaskmode> 1 "register_operand")))] "<MODE_SIZE> == 64 || TARGET_AVX512VL" { @@ -3137,36 +3156,20 @@ (set_attr "prefix_rep" "1,*") (set_attr "mode" "V4SF")]) -(define_expand "reduc_plus_scal_v4sf" - [(plus:V4SF - (match_operand:SF 0 "register_operand") - (match_operand:V4SF 1 "register_operand"))] - "TARGET_SSE" -{ - rtx vtmp = gen_reg_rtx (V4SFmode); - rtx stmp = gen_reg_rtx (SFmode); - if (TARGET_SSE3) - emit_insn (gen_sse3_movshdup (vtmp, operands[1])); - else - emit_insn (gen_sse_shufps (vtmp, operands[1], operands[1], GEN_INT(177))); - - emit_insn (gen_addv4sf3 (operands[1], operands[1], vtmp)); - emit_insn (gen_sse_movhlps (vtmp, vtmp, operands[1])); - emit_insn (gen_vec_extractv4sfsf (stmp, vtmp, const0_rtx)); - emit_insn (gen_vec_extractv4sfsf (operands[0], operands[1], const0_rtx)); - emit_insn (gen_addsf3 (operands[0], operands[0], stmp)); - DONE; -}) +(define_mode_iterator REDUC_SSE_PLUS_MODE + [(V2DF "TARGET_SSE") (V4SF "TARGET_SSE") + (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL")]) -(define_expand "reduc_plus_scal_v2df" - [(plus:V2DF - (match_operand:DF 0 "register_operand") - (match_operand:V2DF 1 "register_operand"))] - "TARGET_SSE" +(define_expand "reduc_plus_scal_<mode>" + [(plus:REDUC_SSE_PLUS_MODE + (match_operand:<ssescalarmode> 0 "register_operand") + (match_operand:REDUC_SSE_PLUS_MODE 1 "register_operand"))] + "" { - rtx tmp = gen_reg_rtx (V2DFmode); - ix86_expand_reduc (gen_addv2df3, tmp, operands[1]); - emit_insn (gen_vec_extractv2dfdf (operands[0], tmp, const0_rtx)); + rtx tmp = gen_reg_rtx (<MODE>mode); + ix86_expand_reduc (gen_add<mode>3, tmp, operands[1]); + emit_insn (gen_vec_extract<mode><ssescalarmodelower> (operands[0], tmp, + const0_rtx)); DONE; }) @@ -3192,7 +3195,9 @@ (define_mode_iterator REDUC_PLUS_MODE [(V4DF "TARGET_AVX") (V8SF "TARGET_AVX") + (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL") (V8DF "TARGET_AVX512F") (V16SF "TARGET_AVX512F") + (V32HF "TARGET_AVX512FP16 && TARGET_AVX512VL") (V32QI "TARGET_AVX") (V64QI "TARGET_AVX512F")]) (define_expand "reduc_plus_scal_<mode>" @@ -3212,7 +3217,8 @@ ;; Modes handled by reduc_sm{in,ax}* patterns. 
(define_mode_iterator REDUC_SSE_SMINMAX_MODE - [(V4SF "TARGET_SSE") (V2DF "TARGET_SSE") + [(V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL") + (V4SF "TARGET_SSE") (V2DF "TARGET_SSE") (V4SI "TARGET_SSE2") (V8HI "TARGET_SSE2") (V16QI "TARGET_SSE2") (V2DI "TARGET_SSE4_2")]) @@ -3231,9 +3237,11 @@ (define_mode_iterator REDUC_SMINMAX_MODE [(V32QI "TARGET_AVX2") (V16HI "TARGET_AVX2") + (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL") (V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2") (V8SF "TARGET_AVX") (V4DF "TARGET_AVX") (V64QI "TARGET_AVX512BW") + (V32HF "TARGET_AVX512FP16 && TARGET_AVX512VL") (V32HI "TARGET_AVX512BW") (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F") (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")]) @@ -3791,8 +3799,8 @@ (define_expand "vec_cmp<mode><avx512fmaskmodelower>" [(set (match_operand:<avx512fmaskmode> 0 "register_operand") (match_operator:<avx512fmaskmode> 1 "" - [(match_operand:V48_AVX512VL 2 "register_operand") - (match_operand:V48_AVX512VL 3 "nonimmediate_operand")]))] + [(match_operand:V48H_AVX512VL 2 "register_operand") + (match_operand:V48H_AVX512VL 3 "nonimmediate_operand")]))] "TARGET_AVX512F" { bool ok = ix86_expand_mask_vec_cmp (operands[0], GET_CODE (operands[1]), @@ -3999,6 +4007,51 @@ DONE; }) +(define_expand "vcond<mode><mode>" + [(set (match_operand:VF_AVX512FP16VL 0 "register_operand") + (if_then_else:VF_AVX512FP16VL + (match_operator 3 "" + [(match_operand:VF_AVX512FP16VL 4 "vector_operand") + (match_operand:VF_AVX512FP16VL 5 "vector_operand")]) + (match_operand:VF_AVX512FP16VL 1 "general_operand") + (match_operand:VF_AVX512FP16VL 2 "general_operand")))] + "TARGET_AVX512FP16" +{ + bool ok = ix86_expand_fp_vcond (operands); + gcc_assert (ok); + DONE; +}) + +(define_expand "vcond<mode><sseintvecmodelower>" + [(set (match_operand:VF_AVX512FP16VL 0 "register_operand") + (if_then_else:VF_AVX512FP16VL + (match_operator 3 "" + [(match_operand:<sseintvecmode> 4 "vector_operand") + (match_operand:<sseintvecmode> 5 "vector_operand")]) + (match_operand:VF_AVX512FP16VL 1 "general_operand") + (match_operand:VF_AVX512FP16VL 2 "general_operand")))] + "TARGET_AVX512FP16" +{ + bool ok = ix86_expand_int_vcond (operands); + gcc_assert (ok); + DONE; +}) + +(define_expand "vcond<sseintvecmodelower><mode>" + [(set (match_operand:<sseintvecmode> 0 "register_operand") + (if_then_else:<sseintvecmode> + (match_operator 3 "" + [(match_operand:VF_AVX512FP16VL 4 "vector_operand") + (match_operand:VF_AVX512FP16VL 5 "vector_operand")]) + (match_operand:<sseintvecmode> 1 "general_operand") + (match_operand:<sseintvecmode> 2 "general_operand")))] + "TARGET_AVX512FP16" +{ + bool ok = ix86_expand_fp_vcond (operands); + gcc_assert (ok); + DONE; +}) + (define_expand "vcond_mask_<mode><avx512fmaskmodelower>" [(set (match_operand:V48_AVX512VL 0 "register_operand") (vec_merge:V48_AVX512VL @@ -4008,10 +4061,10 @@ "TARGET_AVX512F") (define_expand "vcond_mask_<mode><avx512fmaskmodelower>" - [(set (match_operand:VI12_AVX512VL 0 "register_operand") - (vec_merge:VI12_AVX512VL - (match_operand:VI12_AVX512VL 1 "nonimmediate_operand") - (match_operand:VI12_AVX512VL 2 "nonimm_or_0_operand") + [(set (match_operand:VI12HF_AVX512VL 0 "register_operand") + (vec_merge:VI12HF_AVX512VL + (match_operand:VI12HF_AVX512VL 1 "nonimmediate_operand") + (match_operand:VI12HF_AVX512VL 2 "nonimm_or_0_operand") (match_operand:<avx512fmaskmode> 3 "register_operand")))] "TARGET_AVX512BW") @@ -4638,7 +4691,11 @@ (V8SF "TARGET_FMA || TARGET_FMA4 || TARGET_AVX512VL") (V4DF "TARGET_FMA || TARGET_FMA4 || TARGET_AVX512VL") (V16SF 
"TARGET_AVX512F") - (V8DF "TARGET_AVX512F")]) + (V8DF "TARGET_AVX512F") + (HF "TARGET_AVX512FP16") + (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL") + (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL") + (V32HF "TARGET_AVX512FP16")]) (define_expand "fma<mode>4" [(set (match_operand:FMAMODEM 0 "register_operand") @@ -4746,14 +4803,11 @@ (set_attr "mode" "<MODE>")]) ;; Suppose AVX-512F as baseline -(define_mode_iterator VF_SF_AVX512VL - [SF V16SF (V8SF "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL") - DF V8DF (V4DF "TARGET_AVX512VL") (V2DF "TARGET_AVX512VL")]) - (define_mode_iterator VFH_SF_AVX512VL [(V32HF "TARGET_AVX512FP16") (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL") (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL") + (HF "TARGET_AVX512FP16") SF V16SF (V8SF "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL") DF V8DF (V4DF "TARGET_AVX512VL") (V2DF "TARGET_AVX512VL")]) @@ -4772,13 +4826,13 @@ (set_attr "mode" "<MODE>")]) (define_expand "cond_fma<mode>" - [(set (match_operand:VF_AVX512VL 0 "register_operand") - (vec_merge:VF_AVX512VL - (fma:VF_AVX512VL - (match_operand:VF_AVX512VL 2 "vector_operand") - (match_operand:VF_AVX512VL 3 "vector_operand") - (match_operand:VF_AVX512VL 4 "vector_operand")) - (match_operand:VF_AVX512VL 5 "nonimm_or_0_operand") + [(set (match_operand:VFH_AVX512VL 0 "register_operand") + (vec_merge:VFH_AVX512VL + (fma:VFH_AVX512VL + (match_operand:VFH_AVX512VL 2 "vector_operand") + (match_operand:VFH_AVX512VL 3 "vector_operand") + (match_operand:VFH_AVX512VL 4 "vector_operand")) + (match_operand:VFH_AVX512VL 5 "nonimm_or_0_operand") (match_operand:<avx512fmaskmode> 1 "register_operand")))] "TARGET_AVX512F" { @@ -4872,14 +4926,14 @@ (set_attr "mode" "<MODE>")]) (define_expand "cond_fms<mode>" - [(set (match_operand:VF_AVX512VL 0 "register_operand") - (vec_merge:VF_AVX512VL - (fma:VF_AVX512VL - (match_operand:VF_AVX512VL 2 "vector_operand") - (match_operand:VF_AVX512VL 3 "vector_operand") - (neg:VF_AVX512VL - (match_operand:VF_AVX512VL 4 "vector_operand"))) - (match_operand:VF_AVX512VL 5 "nonimm_or_0_operand") + [(set (match_operand:VFH_AVX512VL 0 "register_operand") + (vec_merge:VFH_AVX512VL + (fma:VFH_AVX512VL + (match_operand:VFH_AVX512VL 2 "vector_operand") + (match_operand:VFH_AVX512VL 3 "vector_operand") + (neg:VFH_AVX512VL + (match_operand:VFH_AVX512VL 4 "vector_operand"))) + (match_operand:VFH_AVX512VL 5 "nonimm_or_0_operand") (match_operand:<avx512fmaskmode> 1 "register_operand")))] "TARGET_AVX512F" { @@ -4975,14 +5029,14 @@ (set_attr "mode" "<MODE>")]) (define_expand "cond_fnma<mode>" - [(set (match_operand:VF_AVX512VL 0 "register_operand") - (vec_merge:VF_AVX512VL - (fma:VF_AVX512VL - (neg:VF_AVX512VL - (match_operand:VF_AVX512VL 2 "vector_operand")) - (match_operand:VF_AVX512VL 3 "vector_operand") - (match_operand:VF_AVX512VL 4 "vector_operand")) - (match_operand:VF_AVX512VL 5 "nonimm_or_0_operand") + [(set (match_operand:VFH_AVX512VL 0 "register_operand") + (vec_merge:VFH_AVX512VL + (fma:VFH_AVX512VL + (neg:VFH_AVX512VL + (match_operand:VFH_AVX512VL 2 "vector_operand")) + (match_operand:VFH_AVX512VL 3 "vector_operand") + (match_operand:VFH_AVX512VL 4 "vector_operand")) + (match_operand:VFH_AVX512VL 5 "nonimm_or_0_operand") (match_operand:<avx512fmaskmode> 1 "register_operand")))] "TARGET_AVX512F" { @@ -5080,15 +5134,15 @@ (set_attr "mode" "<MODE>")]) (define_expand "cond_fnms<mode>" - [(set (match_operand:VF_AVX512VL 0 "register_operand") - (vec_merge:VF_AVX512VL - (fma:VF_AVX512VL - (neg:VF_AVX512VL - (match_operand:VF_AVX512VL 2 "vector_operand")) - 
(match_operand:VF_AVX512VL 3 "vector_operand") - (neg:VF_AVX512VL - (match_operand:VF_AVX512VL 4 "vector_operand"))) - (match_operand:VF_AVX512VL 5 "nonimm_or_0_operand") + [(set (match_operand:VFH_AVX512VL 0 "register_operand") + (vec_merge:VFH_AVX512VL + (fma:VFH_AVX512VL + (neg:VFH_AVX512VL + (match_operand:VFH_AVX512VL 2 "vector_operand")) + (match_operand:VFH_AVX512VL 3 "vector_operand") + (neg:VFH_AVX512VL + (match_operand:VFH_AVX512VL 4 "vector_operand"))) + (match_operand:VFH_AVX512VL 5 "nonimm_or_0_operand") (match_operand:<avx512fmaskmode> 1 "register_operand")))] "TARGET_AVX512F" { @@ -5793,6 +5847,168 @@ [(set_attr "type" "ssemuladd") (set_attr "mode" "<MODE>")]) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; Complex type operations +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_int_iterator UNSPEC_COMPLEX_F_C_MA + [UNSPEC_COMPLEX_FMA UNSPEC_COMPLEX_FCMA]) + +(define_int_iterator UNSPEC_COMPLEX_F_C_MUL + [UNSPEC_COMPLEX_FMUL UNSPEC_COMPLEX_FCMUL]) + +(define_int_attr complexopname + [(UNSPEC_COMPLEX_FMA "fmaddc") + (UNSPEC_COMPLEX_FCMA "fcmaddc") + (UNSPEC_COMPLEX_FMUL "fmulc") + (UNSPEC_COMPLEX_FCMUL "fcmulc")]) + +(define_expand "<avx512>_fmaddc_<mode>_maskz<round_expand_name>" + [(match_operand:VF_AVX512FP16VL 0 "register_operand") + (match_operand:VF_AVX512FP16VL 1 "<round_expand_nimm_predicate>") + (match_operand:VF_AVX512FP16VL 2 "<round_expand_nimm_predicate>") + (match_operand:VF_AVX512FP16VL 3 "<round_expand_nimm_predicate>") + (match_operand:<avx512fmaskcmode> 4 "register_operand")] + "TARGET_AVX512FP16 && <round_mode512bit_condition>" +{ + emit_insn (gen_fma_fmaddc_<mode>_maskz_1<round_expand_name> ( + operands[0], operands[1], operands[2], operands[3], + CONST0_RTX (<MODE>mode), operands[4]<round_expand_operand>)); + DONE; +}) + +(define_expand "<avx512>_fcmaddc_<mode>_maskz<round_expand_name>" + [(match_operand:VF_AVX512FP16VL 0 "register_operand") + (match_operand:VF_AVX512FP16VL 1 "<round_expand_nimm_predicate>") + (match_operand:VF_AVX512FP16VL 2 "<round_expand_nimm_predicate>") + (match_operand:VF_AVX512FP16VL 3 "<round_expand_nimm_predicate>") + (match_operand:<avx512fmaskcmode> 4 "register_operand")] + "TARGET_AVX512FP16 && <round_mode512bit_condition>" +{ + emit_insn (gen_fma_fcmaddc_<mode>_maskz_1<round_expand_name> ( + operands[0], operands[1], operands[2], operands[3], + CONST0_RTX (<MODE>mode), operands[4]<round_expand_operand>)); + DONE; +}) + +(define_insn "fma_<complexopname>_<mode><sdc_maskz_name><round_name>" + [(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=&v") + (unspec:VF_AVX512FP16VL + [(match_operand:VF_AVX512FP16VL 1 "<round_nimm_predicate>" "%v") + (match_operand:VF_AVX512FP16VL 2 "<round_nimm_predicate>" "<round_constraint>") + (match_operand:VF_AVX512FP16VL 3 "<round_nimm_predicate>" "0")] + UNSPEC_COMPLEX_F_C_MA))] + "TARGET_AVX512FP16 && <sdc_mask_mode512bit_condition> && <round_mode512bit_condition>" + "v<complexopname><ssemodesuffix>\t{<round_sdc_mask_op4>%2, %1, %0<sdc_mask_op4>|%0<sdc_mask_op4>, %1, %2<round_sdc_mask_op4>}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +(define_insn "<avx512>_<complexopname>_<mode>_mask<round_name>" + [(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=&v") + (vec_merge:VF_AVX512FP16VL + (unspec:VF_AVX512FP16VL + [(match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "%v") + (match_operand:VF_AVX512FP16VL 2 "nonimmediate_operand" "<round_constraint>") + (match_operand:VF_AVX512FP16VL 3 
"register_operand" "0")] + UNSPEC_COMPLEX_F_C_MA) + (match_dup 1) + (unspec:<avx512fmaskmode> + [(match_operand:<avx512fmaskcmode> 4 "register_operand" "Yk")] + UNSPEC_COMPLEX_MASK)))] + "TARGET_AVX512FP16 && <round_mode512bit_condition>" + "v<complexopname><ssemodesuffix>\t{<round_op5>%2, %1, %0%{%4%}|%0%{%4%}, %1, %2<round_op5>}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +(define_insn "<avx512>_<complexopname>_<mode><maskc_name><round_name>" + [(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=&v") + (unspec:VF_AVX512FP16VL + [(match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "%v") + (match_operand:VF_AVX512FP16VL 2 "nonimmediate_operand" "<round_constraint>")] + UNSPEC_COMPLEX_F_C_MUL))] + "TARGET_AVX512FP16 && <round_mode512bit_condition>" + "v<complexopname><ssemodesuffix>\t{<round_maskc_op3>%2, %1, %0<maskc_operand3>|%0<maskc_operand3>, %1, %2<round_maskc_op3>}" + [(set_attr "type" "ssemul") + (set_attr "mode" "<MODE>")]) + +(define_expand "avx512fp16_fmaddcsh_v8hf_maskz<round_expand_name>" + [(match_operand:V8HF 0 "register_operand") + (match_operand:V8HF 1 "<round_expand_nimm_predicate>") + (match_operand:V8HF 2 "<round_expand_nimm_predicate>") + (match_operand:V8HF 3 "<round_expand_nimm_predicate>") + (match_operand:QI 4 "register_operand")] + "TARGET_AVX512FP16 && <round_mode512bit_condition>" +{ + emit_insn (gen_avx512fp16_fma_fmaddcsh_v8hf_maskz<round_expand_name> ( + operands[0], operands[1], operands[2], operands[3], + CONST0_RTX (V8HFmode), operands[4]<round_expand_operand>)); + DONE; +}) + +(define_expand "avx512fp16_fcmaddcsh_v8hf_maskz<round_expand_name>" + [(match_operand:V8HF 0 "register_operand") + (match_operand:V8HF 1 "<round_expand_nimm_predicate>") + (match_operand:V8HF 2 "<round_expand_nimm_predicate>") + (match_operand:V8HF 3 "<round_expand_nimm_predicate>") + (match_operand:QI 4 "register_operand")] + "TARGET_AVX512FP16 && <round_mode512bit_condition>" +{ + emit_insn (gen_avx512fp16_fma_fcmaddcsh_v8hf_maskz<round_expand_name> ( + operands[0], operands[1], operands[2], operands[3], + CONST0_RTX (V8HFmode), operands[4]<round_expand_operand>)); + DONE; +}) + +(define_insn "avx512fp16_fma_<complexopname>sh_v8hf<mask_scalarcz_name><round_scalarcz_name>" + [(set (match_operand:V8HF 0 "register_operand" "=&v") + (vec_merge:V8HF + (unspec:V8HF + [(match_operand:V8HF 1 "<round_scalarcz_nimm_predicate>" "v") + (match_operand:V8HF 2 "<round_scalarcz_nimm_predicate>" "<round_scalarcz_constraint>") + (match_operand:V8HF 3 "<round_scalarcz_nimm_predicate>" "0")] + UNSPEC_COMPLEX_F_C_MA) + (match_dup 2) + (const_int 3)))] + "TARGET_AVX512FP16" + "v<complexopname>sh\t{<round_scalarcz_mask_op4>%2, %1, %0<mask_scalarcz_operand4>|%0<mask_scalarcz_operand4>, %1, %2<round_scalarcz_maskcz_mask_op4>}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "V8HF")]) + +(define_insn "avx512fp16_<complexopname>sh_v8hf_mask<round_name>" + [(set (match_operand:V8HF 0 "register_operand" "=&v") + (vec_merge:V8HF + (vec_merge:V8HF + (unspec:V8HF + [(match_operand:V8HF 1 "<round_nimm_predicate>" "v") + (match_operand:V8HF 2 "<round_nimm_predicate>" "<round_constraint>") + (match_operand:V8HF 3 "<round_nimm_predicate>" "0")] + UNSPEC_COMPLEX_F_C_MA) + (match_dup 1) + (unspec:QI [(match_operand:QI 4 "register_operand" "Yk")] + UNSPEC_COMPLEX_MASK)) + (match_dup 2) + (const_int 3)))] + "TARGET_AVX512FP16" + "v<complexopname>sh\t{<round_op5>%2, %1, %0%{%4%}|%0%{%4%}, %1, %2<round_op5>}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "V8HF")]) + 
+(define_insn "avx512fp16_<complexopname>sh_v8hf<mask_scalarc_name><round_scalarcz_name>" + [(set (match_operand:V8HF 0 "register_operand" "=&v") + (vec_merge:V8HF + (unspec:V8HF + [(match_operand:V8HF 1 "nonimmediate_operand" "v") + (match_operand:V8HF 2 "<round_scalarcz_nimm_predicate>" "<round_scalarcz_constraint>")] + UNSPEC_COMPLEX_F_C_MUL) + (match_dup 1) + (const_int 3)))] + "TARGET_AVX512FP16" + "v<complexopname>sh\t{<round_scalarc_mask_op3>%2, %1, %0<mask_scalarc_operand3>|%0<mask_scalarc_operand3>, %1, %2<round_scalarc_mask_op3>}" + [(set_attr "type" "ssemul") + (set_attr "mode" "V8HF")]) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; Parallel half-precision floating point conversion operations @@ -5824,6 +6040,12 @@ (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) +(define_expand "float<floatunssuffix><mode><ssePHmodelower>2" + [(set (match_operand:<ssePHmode> 0 "register_operand") + (any_float:<ssePHmode> + (match_operand:VI2H_AVX512VL 1 "nonimmediate_operand")))] + "TARGET_AVX512FP16") + (define_insn "avx512fp16_vcvt<floatsuffix><sseintconvert>2ph_<mode><mask_name><round_name>" [(set (match_operand:<ssePHmode> 0 "register_operand" "=v") (any_float:<ssePHmode> @@ -5834,11 +6056,23 @@ (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) -(define_expand "avx512fp16_vcvt<floatsuffix><sseintconvert>2ph_<mode>" - [(set (match_operand:V8HF 0 "register_operand" "=v") +(define_expand "float<floatunssuffix><mode>v4hf2" + [(set (match_operand:V4HF 0 "register_operand") + (any_float:V4HF + (match_operand:VI4_128_8_256 1 "vector_operand")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" +{ + operands[0] = lowpart_subreg (V8HFmode, operands[0], V4HFmode); + emit_insn (gen_avx512fp16_float<floatunssuffix><mode>v4hf2 (operands[0], + operands[1])); + DONE; +}) + +(define_expand "avx512fp16_float<floatunssuffix><mode>v4hf2" + [(set (match_operand:V8HF 0 "register_operand") (vec_concat:V8HF - (any_float:V4HF (match_operand:VI4_128_8_256 1 "vector_operand" "vm")) - (match_dup 2)))] + (any_float:V4HF (match_operand:VI4_128_8_256 1 "vector_operand")) + (match_dup 2)))] "TARGET_AVX512FP16 && TARGET_AVX512VL" "operands[2] = CONST0_RTX (V4HFmode);") @@ -5897,11 +6131,23 @@ (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) -(define_expand "avx512fp16_vcvt<floatsuffix>qq2ph_v2di" - [(set (match_operand:V8HF 0 "register_operand" "=v") +(define_expand "float<floatunssuffix>v2div2hf2" + [(set (match_operand:V2HF 0 "register_operand") + (any_float:V2HF + (match_operand:V2DI 1 "vector_operand")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" +{ + operands[0] = lowpart_subreg (V8HFmode, operands[0], V2HFmode); + emit_insn (gen_avx512fp16_float<floatunssuffix>v2div2hf2 (operands[0], + operands[1])); + DONE; +}) + +(define_expand "avx512fp16_float<floatunssuffix>v2div2hf2" + [(set (match_operand:V8HF 0 "register_operand") (vec_concat:V8HF - (any_float:V2HF (match_operand:V2DI 1 "vector_operand" "vm")) - (match_dup 2)))] + (any_float:V2HF (match_operand:V2DI 1 "vector_operand")) + (match_dup 2)))] "TARGET_AVX512FP16 && TARGET_AVX512VL" "operands[2] = CONST0_RTX (V6HFmode);") @@ -6000,6 +6246,12 @@ (set_attr "prefix" "evex") (set_attr "mode" "HF")]) +(define_expand "fix<fixunssuffix>_trunc<ssePHmodelower><mode>2" + [(set (match_operand:VI2H_AVX512VL 0 "register_operand") + (any_fix:VI2H_AVX512VL + (match_operand:<ssePHmode> 1 "nonimmediate_operand")))] + "TARGET_AVX512FP16") + (define_insn 
"avx512fp16_fix<fixunssuffix>_trunc<mode>2<mask_name><round_saeonly_name>" [(set (match_operand:VI2H_AVX512VL 0 "register_operand" "=v") (any_fix:VI2H_AVX512VL @@ -6010,6 +6262,21 @@ (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) +(define_expand "fix<fixunssuffix>_truncv4hf<mode>2" + [(set (match_operand:VI4_128_8_256 0 "register_operand") + (any_fix:VI4_128_8_256 + (match_operand:V4HF 1 "nonimmediate_operand")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" +{ + if (!MEM_P (operands[1])) + { + operands[1] = lowpart_subreg (V8HFmode, operands[1], V4HFmode); + emit_insn (gen_avx512fp16_fix<fixunssuffix>_trunc<mode>2 (operands[0], + operands[1])); + DONE; + } +}) + (define_insn "avx512fp16_fix<fixunssuffix>_trunc<mode>2<mask_name>" [(set (match_operand:VI4_128_8_256 0 "register_operand" "=v") (any_fix:VI4_128_8_256 @@ -6032,6 +6299,21 @@ (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) +(define_expand "fix<fixunssuffix>_truncv2hfv2di2" + [(set (match_operand:V2DI 0 "register_operand") + (any_fix:V2DI + (match_operand:V2HF 1 "nonimmediate_operand")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" +{ + if (!MEM_P (operands[1])) + { + operands[1] = lowpart_subreg (V8HFmode, operands[1], V2HFmode); + emit_insn (gen_avx512fp16_fix<fixunssuffix>_truncv2di2 (operands[0], + operands[1])); + DONE; + } +}) + (define_insn "avx512fp16_fix<fixunssuffix>_truncv2di2<mask_name>" [(set (match_operand:V2DI 0 "register_operand" "=v") (any_fix:V2DI @@ -6080,6 +6362,12 @@ [(V16SF "x") (V8SF "x") (V4SF "x") (V8DF "") (V4DF "") (V2DF "")]) +(define_expand "extend<ssePHmodelower><mode>2" + [(set (match_operand:VF48H_AVX512VL 0 "register_operand") + (float_extend:VF48H_AVX512VL + (match_operand:<ssePHmode> 1 "nonimmediate_operand")))] + "TARGET_AVX512FP16") + (define_insn "avx512fp16_float_extend_ph<mode>2<mask_name><round_saeonly_name>" [(set (match_operand:VF48H_AVX512VL 0 "register_operand" "=v") (float_extend:VF48H_AVX512VL @@ -6090,6 +6378,21 @@ (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) +(define_expand "extendv4hf<mode>2" + [(set (match_operand:VF4_128_8_256 0 "register_operand") + (float_extend:VF4_128_8_256 + (match_operand:V4HF 1 "nonimmediate_operand")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" +{ + if (!MEM_P (operands[1])) + { + operands[1] = lowpart_subreg (V8HFmode, operands[1], V4HFmode); + emit_insn (gen_avx512fp16_float_extend_ph<mode>2 + (operands[0], operands[1])); + DONE; + } +}) + (define_insn "avx512fp16_float_extend_ph<mode>2<mask_name>" [(set (match_operand:VF4_128_8_256 0 "register_operand" "=v") (float_extend:VF4_128_8_256 @@ -6112,6 +6415,21 @@ (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) +(define_expand "extendv2hfv2df2" + [(set (match_operand:V2DF 0 "register_operand") + (float_extend:V2DF + (match_operand:V2HF 1 "nonimmediate_operand")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" +{ + if (!MEM_P (operands[1])) + { + operands[1] = lowpart_subreg (V8HFmode, operands[1], V2HFmode); + emit_insn (gen_avx512fp16_float_extend_phv2df2 + (operands[0], operands[1])); + DONE; + } +}) + (define_insn "avx512fp16_float_extend_phv2df2<mask_name>" [(set (match_operand:V2DF 0 "register_operand" "=v") (float_extend:V2DF @@ -6134,6 +6452,12 @@ (set_attr "prefix" "evex") (set_attr "mode" "TI")]) +(define_expand "trunc<mode><ssePHmodelower>2" + [(set (match_operand:<ssePHmode> 0 "register_operand") + (float_truncate:<ssePHmode> + (match_operand:VF48H_AVX512VL 1 "nonimmediate_operand")))] + "TARGET_AVX512FP16") + (define_insn 
"avx512fp16_vcvt<castmode>2ph_<mode><mask_name><round_name>" [(set (match_operand:<ssePHmode> 0 "register_operand" "=v") (float_truncate:<ssePHmode> @@ -6144,11 +6468,21 @@ (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) -(define_expand "avx512fp16_vcvt<castmode>2ph_<mode>" - [(set (match_operand:V8HF 0 "register_operand" "=v") +(define_expand "trunc<mode>v4hf2" + [(set (match_operand:V4HF 0 "register_operand") + (float_truncate:V4HF (match_operand:VF4_128_8_256 1 "vector_operand")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" +{ + operands[0] = lowpart_subreg (V8HFmode, operands[0], V4HFmode); + emit_insn (gen_avx512fp16_trunc<mode>v4hf2 (operands[0], operands[1])); + DONE; +}) + +(define_expand "avx512fp16_trunc<mode>v4hf2" + [(set (match_operand:V8HF 0 "register_operand") (vec_concat:V8HF (float_truncate:V4HF - (match_operand:VF4_128_8_256 1 "vector_operand" "vm")) + (match_operand:VF4_128_8_256 1 "vector_operand")) (match_dup 2)))] "TARGET_AVX512FP16 && TARGET_AVX512VL" "operands[2] = CONST0_RTX (V4HFmode);") @@ -6213,11 +6547,20 @@ (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) -(define_expand "avx512fp16_vcvtpd2ph_v2df" - [(set (match_operand:V8HF 0 "register_operand" "=v") +(define_expand "truncv2dfv2hf2" + [(set (match_operand:V2HF 0 "register_operand") + (float_truncate:V2HF (match_operand:V2DF 1 "vector_operand")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" +{ + operands[0] = lowpart_subreg (V8HFmode, operands[0], V2HFmode); + emit_insn (gen_avx512fp16_truncv2dfv2hf2 (operands[0], operands[1])); + DONE; +}) + +(define_expand "avx512fp16_truncv2dfv2hf2" + [(set (match_operand:V8HF 0 "register_operand") (vec_concat:V8HF - (float_truncate:V2HF - (match_operand:V2DF 1 "vector_operand" "vm")) + (float_truncate:V2HF (match_operand:V2DF 1 "vector_operand")) (match_dup 2)))] "TARGET_AVX512FP16 && TARGET_AVX512VL" "operands[2] = CONST0_RTX (V6HFmode);") @@ -15229,6 +15572,21 @@ DONE; }) +(define_expand "vcondu<mode><sseintvecmodelower>" + [(set (match_operand:VF_AVX512FP16VL 0 "register_operand") + (if_then_else:VF_AVX512FP16VL + (match_operator 3 "" + [(match_operand:<sseintvecmode> 4 "vector_operand") + (match_operand:<sseintvecmode> 5 "vector_operand")]) + (match_operand:VF_AVX512FP16VL 1 "general_operand") + (match_operand:VF_AVX512FP16VL 2 "general_operand")))] + "TARGET_AVX512FP16" +{ + bool ok = ix86_expand_int_vcond (operands); + gcc_assert (ok); + DONE; +}) + (define_expand "vcondeq<VI8F_128:mode>v2di" [(set (match_operand:VI8F_128 0 "register_operand") (if_then_else:VI8F_128 @@ -21298,14 +21656,14 @@ (set_attr "mode" "<MODE>")]) (define_insn "*sse4_1_round<ssescalarmodesuffix>" - [(set (match_operand:VF_128 0 "register_operand" "=Yr,*x,x,v") - (vec_merge:VF_128 - (vec_duplicate:VF_128 + [(set (match_operand:VFH_128 0 "register_operand" "=Yr,*x,x,v") + (vec_merge:VFH_128 + (vec_duplicate:VFH_128 (unspec:<ssescalarmode> [(match_operand:<ssescalarmode> 2 "nonimmediate_operand" "Yrm,*xm,xm,vm") (match_operand:SI 3 "const_0_to_15_operand" "n,n,n,n")] UNSPEC_ROUND)) - (match_operand:VF_128 1 "register_operand" "0,0,x,v") + (match_operand:VFH_128 1 "register_operand" "0,0,x,v") (const_int 1)))] "TARGET_SSE4_1" "@ diff --git a/gcc/config/i386/subst.md b/gcc/config/i386/subst.md index 157d49f..11e62c6 100644 --- a/gcc/config/i386/subst.md +++ b/gcc/config/i386/subst.md @@ -28,6 +28,9 @@ V16SF V8SF V4SF V8DF V4DF V2DF]) +(define_mode_iterator SUBST_CV + [V32HF V16HF V8HF]) + (define_mode_iterator SUBST_S [QI HI SI DI]) @@ -42,9 +45,11 @@ QI HI SI DI SF DF]) 
(define_subst_attr "mask_name" "mask" "" "_mask") +(define_subst_attr "maskc_name" "maskc" "" "_mask") (define_subst_attr "mask_applied" "mask" "false" "true") (define_subst_attr "mask_operand2" "mask" "" "%{%3%}%N2") (define_subst_attr "mask_operand3" "mask" "" "%{%4%}%N3") +(define_subst_attr "maskc_operand3" "maskc" "" "%{%4%}%N3") (define_subst_attr "mask_operand3_1" "mask" "" "%%{%%4%%}%%N3") ;; for sprintf (define_subst_attr "mask_operand4" "mask" "" "%{%5%}%N4") (define_subst_attr "mask_operand6" "mask" "" "%{%7%}%N6") @@ -89,6 +94,18 @@ (match_dup 0) (match_operand:<avx512fmaskmode> 2 "register_operand" "Yk")))]) +(define_subst "maskc" + [(set (match_operand:SUBST_CV 0) + (match_operand:SUBST_CV 1))] + "TARGET_AVX512F" + [(set (match_dup 0) + (vec_merge:SUBST_CV + (match_dup 1) + (match_operand:SUBST_CV 2 "nonimm_or_0_operand" "0C") + (unspec:<avx512fmaskmode> + [(match_operand:<avx512fmaskcmode> 3 "register_operand" "Yk")] + UNSPEC_COMPLEX_MASK)))]) + (define_subst_attr "mask_scalar_merge_name" "mask_scalar_merge" "" "_mask") (define_subst_attr "mask_scalar_merge_operand3" "mask_scalar_merge" "" "%{%3%}") (define_subst_attr "mask_scalar_merge_operand4" "mask_scalar_merge" "" "%{%4%}") @@ -137,12 +154,31 @@ (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk")) (match_dup 2) (const_int 1)))]) +(define_subst_attr "sdc_maskz_name" "sdc" "" "_maskz_1") +(define_subst_attr "sdc_mask_op4" "sdc" "" "%{%5%}%N4") +(define_subst_attr "sdc_mask_op5" "sdc" "" "%{%6%}%N5") +(define_subst_attr "sdc_mask_mode512bit_condition" "sdc" "1" "(<MODE_SIZE> == 64 || TARGET_AVX512VL)") + +(define_subst "sdc" + [(set (match_operand:SUBST_CV 0) + (match_operand:SUBST_CV 1))] + "" + [(set (match_dup 0) + (vec_merge:SUBST_CV + (match_dup 1) + (match_operand:SUBST_CV 2 "const0_operand" "C") + (unspec:<avx512fmaskmode> + [(match_operand:<avx512fmaskcmode> 3 "register_operand" "Yk")] + UNSPEC_COMPLEX_MASK))) +]) (define_subst_attr "round_name" "round" "" "_round") (define_subst_attr "round_mask_operand2" "mask" "%R2" "%R4") (define_subst_attr "round_mask_operand3" "mask" "%R3" "%R5") +(define_subst_attr "round_maskc_operand3" "maskc" "%R3" "%R5") (define_subst_attr "round_mask_operand4" "mask" "%R4" "%R6") (define_subst_attr "round_sd_mask_operand4" "sd" "%R4" "%R6") +(define_subst_attr "round_sdc_mask_operand4" "sdc" "%R4" "%R6") (define_subst_attr "round_op2" "round" "" "%R2") (define_subst_attr "round_op3" "round" "" "%R3") (define_subst_attr "round_op4" "round" "" "%R4") @@ -150,8 +186,10 @@ (define_subst_attr "round_op6" "round" "" "%R6") (define_subst_attr "round_mask_op2" "round" "" "<round_mask_operand2>") (define_subst_attr "round_mask_op3" "round" "" "<round_mask_operand3>") +(define_subst_attr "round_maskc_op3" "round" "" "<round_maskc_operand3>") (define_subst_attr "round_mask_op4" "round" "" "<round_mask_operand4>") (define_subst_attr "round_sd_mask_op4" "round" "" "<round_sd_mask_operand4>") +(define_subst_attr "round_sdc_mask_op4" "round" "" "<round_sdc_mask_operand4>") (define_subst_attr "round_constraint" "round" "vm" "v") (define_subst_attr "round_qq2phsuff" "round" "<qq2phsuff>" "") (define_subst_attr "bcst_round_constraint" "round" "vmBr" "v") @@ -189,6 +227,7 @@ (define_subst_attr "round_saeonly_mask_scalar_merge_operand4" "mask_scalar_merge" "%r4" "%r5") (define_subst_attr "round_saeonly_maskz_scalar_operand5" "maskz_scalar" "%r5" "%r7") (define_subst_attr "round_saeonly_sd_mask_operand5" "sd" "%r5" "%r7") +(define_subst_attr "round_saeonly_sdc_mask_operand5" "sdc" "%r5" "%r7") 
(define_subst_attr "round_saeonly_op2" "round_saeonly" "" "%r2") (define_subst_attr "round_saeonly_op3" "round_saeonly" "" "%r3") (define_subst_attr "round_saeonly_op4" "round_saeonly" "" "%r4") @@ -289,8 +328,12 @@ (match_operand:<avx512fmaskmode> 5 "register_operand")]) (define_subst_attr "mask_scalar_name" "mask_scalar" "" "_mask") +(define_subst_attr "mask_scalarcz_name" "mask_scalarcz" "" "_maskz") +(define_subst_attr "mask_scalarc_name" "mask_scalarc" "" "_mask") +(define_subst_attr "mask_scalarc_operand3" "mask_scalarc" "" "%{%4%}%N3") (define_subst_attr "mask_scalar_operand3" "mask_scalar" "" "%{%4%}%N3") (define_subst_attr "mask_scalar_operand4" "mask_scalar" "" "%{%5%}%N4") +(define_subst_attr "mask_scalarcz_operand4" "mask_scalarcz" "" "%{%5%}%N4") (define_subst "mask_scalar" [(set (match_operand:SUBST_V 0) @@ -308,12 +351,55 @@ (match_dup 2) (const_int 1)))]) +(define_subst "mask_scalarcz" + [(set (match_operand:SUBST_CV 0) + (vec_merge:SUBST_CV + (match_operand:SUBST_CV 1) + (match_operand:SUBST_CV 2) + (const_int 3)))] + "TARGET_AVX512F" + [(set (match_dup 0) + (vec_merge:SUBST_CV + (vec_merge:SUBST_CV + (match_dup 1) + (match_operand:SUBST_CV 3 "const0_operand" "C") + (unspec:<avx512fmaskmode> + [(match_operand:<avx512fmaskcmode> 4 "register_operand" "Yk")] + UNSPEC_COMPLEX_MASK)) + (match_dup 2) + (const_int 3)))]) + +(define_subst "mask_scalarc" + [(set (match_operand:SUBST_CV 0) + (vec_merge:SUBST_CV + (match_operand:SUBST_CV 1) + (match_operand:SUBST_CV 2) + (const_int 3)))] + "TARGET_AVX512F" + [(set (match_dup 0) + (vec_merge:SUBST_CV + (vec_merge:SUBST_CV + (match_dup 1) + (match_operand:SUBST_CV 3 "nonimm_or_0_operand" "0C") + (unspec:<avx512fmaskmode> + [(match_operand:<avx512fmaskcmode> 4 "register_operand" "Yk")] + UNSPEC_COMPLEX_MASK)) + (match_dup 2) + (const_int 3)))]) + (define_subst_attr "round_scalar_name" "round_scalar" "" "_round") +(define_subst_attr "round_scalarcz_name" "round_scalarcz" "" "_round") (define_subst_attr "round_scalar_mask_operand3" "mask_scalar" "%R3" "%R5") +(define_subst_attr "round_scalarc_mask_operand3" "mask_scalarc" "%R3" "%R5") +(define_subst_attr "round_scalarcz_mask_operand4" "mask_scalarcz" "%R4" "%R6") (define_subst_attr "round_scalar_mask_op3" "round_scalar" "" "<round_scalar_mask_operand3>") +(define_subst_attr "round_scalarc_mask_op3" "round_scalarcz" "" "<round_scalarc_mask_operand3>") +(define_subst_attr "round_scalarcz_mask_op4" "round_scalarcz" "" "<round_scalarcz_mask_operand4>") (define_subst_attr "round_scalar_constraint" "round_scalar" "vm" "v") +(define_subst_attr "round_scalarcz_constraint" "round_scalarcz" "vm" "v") (define_subst_attr "round_scalar_prefix" "round_scalar" "vex" "evex") (define_subst_attr "round_scalar_nimm_predicate" "round_scalar" "nonimmediate_operand" "register_operand") +(define_subst_attr "round_scalarcz_nimm_predicate" "round_scalarcz" "vector_operand" "register_operand") (define_subst "round_scalar" [(set (match_operand:SUBST_V 0) @@ -331,6 +417,22 @@ (match_operand:SI 3 "const_4_or_8_to_11_operand")] UNSPEC_EMBEDDED_ROUNDING))]) +(define_subst "round_scalarcz" + [(set (match_operand:SUBST_V 0) + (vec_merge:SUBST_V + (match_operand:SUBST_V 1) + (match_operand:SUBST_V 2) + (const_int 3)))] + "TARGET_AVX512F" + [(set (match_dup 0) + (unspec:SUBST_V [ + (vec_merge:SUBST_V + (match_dup 1) + (match_dup 2) + (const_int 3)) + (match_operand:SI 3 "const_4_or_8_to_11_operand")] + UNSPEC_EMBEDDED_ROUNDING))]) + (define_subst_attr "round_saeonly_scalar_name" "round_saeonly_scalar" "" "_round") 
(define_subst_attr "round_saeonly_scalar_mask_operand3" "mask_scalar" "%r3" "%r5") (define_subst_attr "round_saeonly_scalar_mask_operand4" "mask_scalar" "%r4" "%r6") diff --git a/gcc/config/lm32/uclinux-elf.h b/gcc/config/lm32/uclinux-elf.h index 370df4c5..5b638fa 100644 --- a/gcc/config/lm32/uclinux-elf.h +++ b/gcc/config/lm32/uclinux-elf.h @@ -67,6 +67,7 @@ #define TARGET_OS_CPP_BUILTINS() GNU_USER_TARGET_OS_CPP_BUILTINS() +#undef LINK_GCC_C_SEQUENCE_SPEC #define LINK_GCC_C_SEQUENCE_SPEC \ "%{static|static-pie:--start-group} %G %{!nolibc:%L} \ %{static|static-pie:--end-group}%{!static:%{!static-pie:%G}}" diff --git a/gcc/config/pa/pa.c b/gcc/config/pa/pa.c index 0614302..69ba5bd 100644 --- a/gcc/config/pa/pa.c +++ b/gcc/config/pa/pa.c @@ -541,6 +541,16 @@ pa_option_override (void) write_symbols = NO_DEBUG; } + if (TARGET_64BIT && TARGET_HPUX) + { + /* DWARF5 is not supported by gdb. Don't emit DWARF5 unless + specifically selected. */ + if (!global_options_set.x_dwarf_strict) + dwarf_strict = 1; + if (!global_options_set.x_dwarf_version) + dwarf_version = 4; + } + /* We only support the "big PIC" model now. And we always generate PIC code when in 64bit mode. */ if (flag_pic == 1 || TARGET_64BIT) diff --git a/gcc/config/pru/constraints.md b/gcc/config/pru/constraints.md index a31ae93..1e0e703 100644 --- a/gcc/config/pru/constraints.md +++ b/gcc/config/pru/constraints.md @@ -34,6 +34,7 @@ ;; The following constraints are intended for internal use only: ;; Rmd0, Rms0, Rms1: Registers for MUL instruction operands. ;; Rsib: Jump address register suitable for sibling calls. +;; Rrio: The R30 and R31 I/O registers. ;; M: -255 to 0 (for converting ADD to SUB with suitable UBYTE OP2). ;; N: -32768 to 32767 (16-bit signed integer). ;; O: -128 to 127 (8-bit signed integer). @@ -57,6 +58,10 @@ "@internal The multiply source 1 register.") +(define_register_constraint "Rrio" "REGIO_REGS" + "@internal + The R30 and R31 I/O registers.") + ;; Integer constraints. 
(define_constraint "I" diff --git a/gcc/config/pru/predicates.md b/gcc/config/pru/predicates.md index 469002f..1a4b98e 100644 --- a/gcc/config/pru/predicates.md +++ b/gcc/config/pru/predicates.md @@ -121,6 +121,25 @@ return 0; }) +(define_predicate "regio_operand" + (match_code "subreg,reg") +{ + if (register_operand (op, mode)) + { + int regno; + + if (REG_P (op)) + regno = REGNO (op); + else if (GET_CODE (op) == SUBREG && REG_P (SUBREG_REG (op))) + regno = REGNO (SUBREG_REG (op)); + else + return 0; + + return REGNO_REG_CLASS (regno) == REGIO_REGS; + } + return 0; +}) + (define_predicate "reg_or_const_int_operand" (ior (match_operand 0 "const_int_operand") (match_operand 0 "register_operand"))) diff --git a/gcc/config/pru/pru-pragma.c b/gcc/config/pru/pru-pragma.c index 01d0761..3beec23 100644 --- a/gcc/config/pru/pru-pragma.c +++ b/gcc/config/pru/pru-pragma.c @@ -83,4 +83,6 @@ pru_register_pragmas (void) { c_register_pragma (NULL, "ctable_entry", pru_pragma_ctable_entry); c_register_pragma (NULL, "CTABLE_ENTRY", pru_pragma_ctable_entry); + + c_register_addr_space ("__regio_symbol", ADDR_SPACE_REGIO); } diff --git a/gcc/config/pru/pru-protos.h b/gcc/config/pru/pru-protos.h index 74129e9..031ea9e 100644 --- a/gcc/config/pru/pru-protos.h +++ b/gcc/config/pru/pru-protos.h @@ -62,7 +62,10 @@ extern int pru_get_ctable_exact_base_index (unsigned HOST_WIDE_INT caddr); extern int pru_get_ctable_base_index (unsigned HOST_WIDE_INT caddr); extern int pru_get_ctable_base_offset (unsigned HOST_WIDE_INT caddr); +extern int pru_symref2ioregno (rtx op); + extern void pru_register_abicheck_pass (void); + #endif /* RTX_CODE */ #ifdef TREE_CODE diff --git a/gcc/config/pru/pru.c b/gcc/config/pru/pru.c index 30d0da1..9f264b4 100644 --- a/gcc/config/pru/pru.c +++ b/gcc/config/pru/pru.c @@ -1403,11 +1403,42 @@ pru_valid_addr_expr_p (machine_mode mode, rtx base, rtx offset, bool strict_p) return false; } -/* Implement TARGET_LEGITIMATE_ADDRESS_P. */ +/* Return register number (either for r30 or r31) which maps to the + corresponding symbol OP's name in the __regio_symbol address namespace. + + If no mapping can be established (i.e. symbol name is invalid), then + return -1. */ +int pru_symref2ioregno (rtx op) +{ + if (!SYMBOL_REF_P (op)) + return -1; + + const char *name = XSTR (op, 0); + if (!strcmp (name, "__R30")) + return R30_REGNUM; + else if (!strcmp (name, "__R31")) + return R31_REGNUM; + else + return -1; +} + +/* Implement TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P. */ static bool -pru_legitimate_address_p (machine_mode mode, - rtx operand, bool strict_p) +pru_addr_space_legitimate_address_p (machine_mode mode, rtx operand, + bool strict_p, addr_space_t as) { + if (as == ADDR_SPACE_REGIO) + { + /* Address space constraints for __regio_symbol have been checked in + TARGET_INSERT_ATTRIBUTES, and some more checks will be done + during RTL expansion of "mov<mode>". */ + return true; + } + else if (as != ADDR_SPACE_GENERIC) + { + gcc_unreachable (); + } + switch (GET_CODE (operand)) { /* Direct. */ @@ -2002,6 +2033,117 @@ pru_file_start (void) need to confuse users with this warning. */ fprintf (asm_out_file, "\t.set no_warn_regname_label\n"); } + +/* Scan type TYP for pointer references to address space other than + ADDR_SPACE_GENERIC. Return true if such reference is found. + Much of this code was taken from the avr port. 
*/ + +static bool +pru_nongeneric_pointer_addrspace (tree typ) +{ + while (ARRAY_TYPE == TREE_CODE (typ)) + typ = TREE_TYPE (typ); + + if (POINTER_TYPE_P (typ)) + { + addr_space_t as; + tree target = TREE_TYPE (typ); + + /* Pointer to function: Test the function's return type. */ + if (FUNCTION_TYPE == TREE_CODE (target)) + return pru_nongeneric_pointer_addrspace (TREE_TYPE (target)); + + /* "Ordinary" pointers... */ + + while (TREE_CODE (target) == ARRAY_TYPE) + target = TREE_TYPE (target); + + as = TYPE_ADDR_SPACE (target); + + if (!ADDR_SPACE_GENERIC_P (as)) + return true; + + /* Scan pointer's target type. */ + return pru_nongeneric_pointer_addrspace (target); + } + + return false; +} + +/* Implement `TARGET_INSERT_ATTRIBUTES'. For PRU it's used as a hook to + provide better diagnostics for some invalid usages of the __regio_symbol + address space. + + Any escapes of the following checks are supposed to be caught + during the "mov<mode>" pattern expansion. */ + +static void +pru_insert_attributes (tree node, tree *attributes ATTRIBUTE_UNUSED) +{ + + /* Validate __regio_symbol variable declarations. */ + if (VAR_P (node)) + { + const char *name = DECL_NAME (node) + ? IDENTIFIER_POINTER (DECL_NAME (node)) + : "<unknown>"; + tree typ = TREE_TYPE (node); + addr_space_t as = TYPE_ADDR_SPACE (typ); + + if (as == ADDR_SPACE_GENERIC) + return; + + if (AGGREGATE_TYPE_P (typ)) + { + error ("aggregate types are prohibited in " + "%<__regio_symbol%> address space"); + /* Don't bother anymore. Below checks would pile + meaningless errors, which would confuse user. */ + return; + } + if (DECL_INITIAL (node) != NULL_TREE) + error ("variables in %<__regio_symbol%> address space " + "cannot have initial value"); + if (DECL_REGISTER (node)) + error ("variables in %<__regio_symbol%> address space " + "cannot be declared %<register%>"); + if (!TYPE_VOLATILE (typ)) + error ("variables in %<__regio_symbol%> address space " + "must be declared %<volatile%>"); + if (!DECL_EXTERNAL (node)) + error ("variables in %<__regio_symbol%> address space " + "must be declared %<extern%>"); + if (TYPE_MODE (typ) != SImode) + error ("only 32-bit access is supported " + "for %<__regio_symbol%> address space"); + if (strcmp (name, "__R30") != 0 && strcmp (name, "__R31") != 0) + error ("register name %<%s%> not recognized " + "in %<__regio_symbol%> address space", name); + } + + tree typ = NULL_TREE; + + switch (TREE_CODE (node)) + { + case FUNCTION_DECL: + typ = TREE_TYPE (TREE_TYPE (node)); + break; + case TYPE_DECL: + case RESULT_DECL: + case VAR_DECL: + case FIELD_DECL: + case PARM_DECL: + typ = TREE_TYPE (node); + break; + case POINTER_TYPE: + typ = node; + break; + default: + break; + } + if (typ != NULL_TREE && pru_nongeneric_pointer_addrspace (typ)) + error ("pointers to %<__regio_symbol%> address space are prohibited"); +} /* Function argument related. 
*/ @@ -2933,6 +3075,9 @@ pru_unwind_word_mode (void) #undef TARGET_ASM_FILE_START #define TARGET_ASM_FILE_START pru_file_start +#undef TARGET_INSERT_ATTRIBUTES +#define TARGET_INSERT_ATTRIBUTES pru_insert_attributes + #undef TARGET_INIT_BUILTINS #define TARGET_INIT_BUILTINS pru_init_builtins #undef TARGET_EXPAND_BUILTIN @@ -2979,8 +3124,9 @@ pru_unwind_word_mode (void) #undef TARGET_MUST_PASS_IN_STACK #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size -#undef TARGET_LEGITIMATE_ADDRESS_P -#define TARGET_LEGITIMATE_ADDRESS_P pru_legitimate_address_p +#undef TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P +#define TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P \ + pru_addr_space_legitimate_address_p #undef TARGET_INIT_LIBFUNCS #define TARGET_INIT_LIBFUNCS pru_init_libfuncs diff --git a/gcc/config/pru/pru.h b/gcc/config/pru/pru.h index 9b6be32..03f08b1 100644 --- a/gcc/config/pru/pru.h +++ b/gcc/config/pru/pru.h @@ -215,6 +215,7 @@ enum reg_class MULDST_REGS, MULSRC0_REGS, MULSRC1_REGS, + REGIO_REGS, GP_REGS, ALL_REGS, LIM_REG_CLASSES @@ -229,6 +230,7 @@ enum reg_class "MULDST_REGS", \ "MULSRC0_REGS", \ "MULSRC1_REGS", \ + "REGIO_REGS", \ "GP_REGS", \ "ALL_REGS" } @@ -242,6 +244,7 @@ enum reg_class /* MULDST_REGS */ { 0, 0, 0, 0x00000f00, 0}, \ /* MULSRC0_REGS */ { 0, 0, 0, 0x000f0000, 0}, \ /* MULSRC1_REGS */ { 0, 0, 0, 0x00f00000, 0}, \ + /* REGIO_REGS */ { 0, 0, 0, 0xff000000, 0}, \ /* GP_REGS */ { ~0, ~0, ~0, ~0, 0}, \ /* ALL_REGS */ { ~0,~0, ~0, ~0, ~0} \ } @@ -252,6 +255,8 @@ enum reg_class ((REGNO) == MULDST_REGNUM ? MULDST_REGS \ : (REGNO) == MULSRC0_REGNUM ? MULSRC0_REGS \ : (REGNO) == MULSRC1_REGNUM ? MULSRC1_REGS \ + : (REGNO) == R30_REGNUM ? REGIO_REGS \ + : (REGNO) == R31_REGNUM ? REGIO_REGS \ : (REGNO) >= FIRST_ARG_REGNUM \ && (REGNO) <= LAST_ARG_REGNUM ? SIB_REGS \ : (REGNO) == STATIC_CHAIN_REGNUM ? SIB_REGS \ diff --git a/gcc/config/pru/pru.md b/gcc/config/pru/pru.md index e6cfa8e..c0ded8e 100644 --- a/gcc/config/pru/pru.md +++ b/gcc/config/pru/pru.md @@ -36,6 +36,8 @@ (MULSRC0_REGNUM 112) ; Multiply source register. (MULSRC1_REGNUM 116) ; Multiply source register. (LAST_NONIO_GP_REGNUM 119) ; Last non-I/O general purpose register. + (R30_REGNUM 120) ; R30 I/O register. + (R31_REGNUM 124) ; R31 I/O register. (LOOPCNTR_REGNUM 128) ; internal LOOP counter register (LAST_GP_REGNUM 132) ; Last general purpose register. @@ -49,6 +51,13 @@ ] ) +;; Enumerate address spaces. +(define_constants + [ + (ADDR_SPACE_REGIO 1) ; Access to R30 and R31 I/O registers. + ] +) + ;; Enumeration of UNSPECs. (define_c_enum "unspec" [ @@ -68,6 +77,9 @@ UNSPECV_HALT UNSPECV_BLOCKAGE + + UNSPECV_REGIO_READ + UNSPECV_REGIO_WRITE ]) ; Length of an instruction (in bytes). @@ -129,11 +141,62 @@ (match_operand:MOV8_16_32 1 "general_operand"))] "" { - /* It helps to split constant loading and memory access - early, so that the LDI/LDI32 instructions can be hoisted - outside a loop body. */ - if (MEM_P (operands[0])) - operands[1] = force_reg (<MODE>mode, operands[1]); + if (MEM_P (operands[0]) + && MEM_ADDR_SPACE (operands[0]) == ADDR_SPACE_REGIO) + + { + /* Intercept writes to the SImode register I/O "address space". 
*/ + gcc_assert (<MODE>mode == SImode); + + if (!SYMBOL_REF_P (XEXP (operands[0], 0))) + { + error ("invalid access to %<__regio_symbol%> address space"); + FAIL; + } + + if (!REG_P (operands[1])) + operands[1] = force_reg (<MODE>mode, operands[1]); + + int regiono = pru_symref2ioregno (XEXP (operands[0], 0)); + gcc_assert (regiono >= 0); + rtx regio = gen_rtx_REG (<MODE>mode, regiono); + rtx unspecv = gen_rtx_UNSPEC_VOLATILE (<MODE>mode, + gen_rtvec (1, operands[1]), + UNSPECV_REGIO_WRITE); + emit_insn (gen_rtx_SET (regio, unspecv)); + DONE; + } + else if (MEM_P (operands[1]) + && MEM_ADDR_SPACE (operands[1]) == ADDR_SPACE_REGIO) + { + /* Intercept reads from the SImode register I/O "address space". */ + gcc_assert (<MODE>mode == SImode); + + if (!SYMBOL_REF_P (XEXP (operands[1], 0))) + { + error ("invalid access to %<__regio_symbol%> address space"); + FAIL; + } + + if (MEM_P (operands[0])) + operands[0] = force_reg (<MODE>mode, operands[0]); + + int regiono = pru_symref2ioregno (XEXP (operands[1], 0)); + gcc_assert (regiono >= 0); + rtx regio = gen_rtx_REG (<MODE>mode, regiono); + rtx unspecv = gen_rtx_UNSPEC_VOLATILE (<MODE>mode, + gen_rtvec (1, regio), + UNSPECV_REGIO_READ); + emit_insn (gen_rtx_SET (operands[0], unspecv)); + DONE; + } + else if (MEM_P (operands[0])) + { + /* It helps to split constant loading and memory access + early, so that the LDI/LDI32 instructions can be hoisted + outside a loop body. */ + operands[1] = force_reg (<MODE>mode, operands[1]); + } }) ;; Keep a single pattern for 32 bit MOV operations. LRA requires that the @@ -546,6 +609,35 @@ (include "alu-zext.md") +;; Patterns for accessing the R30/R31 I/O registers. + +(define_insn "*regio_readsi" + [(set (match_operand:SI 0 "register_operand" "=r") + (unspec_volatile:SI + [(match_operand:SI 1 "regio_operand" "Rrio")] + UNSPECV_REGIO_READ))] + "" + "mov\\t%0, %1" + [(set_attr "type" "alu")]) + +(define_insn "*regio_nozext_writesi" + [(set (match_operand:SI 0 "regio_operand" "=Rrio") + (unspec_volatile:SI + [(match_operand:SI 1 "register_operand" "r")] + UNSPECV_REGIO_WRITE))] + "" + "mov\\t%0, %1" + [(set_attr "type" "alu")]) + +(define_insn "*regio_zext_write_r30<EQS0:mode>" + [(set (match_operand:SI 0 "regio_operand" "=Rrio") + (unspec_volatile:SI + [(zero_extend:SI (match_operand:EQS0 1 "register_operand" "r"))] + UNSPECV_REGIO_WRITE))] + "" + "mov\\t%0, %1" + [(set_attr "type" "alu")]) + ;; DI logical ops could be automatically split into WORD-mode ops in ;; expand_binop(). But then we'll miss an opportunity to use SI mode ;; operations, since WORD mode for PRU is QI. 
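
To illustrate the PRU changes above: the new __regio_symbol address space is intended to be used roughly as in the sketch below. The __R30/__R31 names are the ones pru_insert_attributes and pru_symref2ioregno accept; the pin number and the helper function are illustrative assumptions, not part of this commit.

/* Minimal sketch: mirror GPI pin 0 onto GPO pin 0 on the PRU.
   Reads of __R31 and writes to __R30 are intercepted by the "mov<mode>"
   expander above and become UNSPECV_REGIO_READ / UNSPECV_REGIO_WRITE.  */
extern volatile __regio_symbol unsigned int __R30;	/* GPO register.  */
extern volatile __regio_symbol unsigned int __R31;	/* GPI register.  */

void
mirror_input_pin0 (void)
{
  if (__R31 & (1u << 0))
    __R30 |= (1u << 0);	/* read-modify-write of R30 */
  else
    __R30 &= ~(1u << 0);
}

Per the checks added in pru_insert_attributes, such declarations must be extern, volatile, 32-bit wide, uninitialized, and named __R30 or __R31; anything else is diagnosed.
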
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md index f88877f..98364f0 100644 --- a/gcc/config/riscv/riscv.md +++ b/gcc/config/riscv/riscv.md @@ -802,7 +802,7 @@ rtx hp = gen_reg_rtx (<MODE>mode); rtx lp = gen_reg_rtx (<MODE>mode); - emit_insn (gen_mul<mode>3_highpart (hp, operands[1], operands[2])); + emit_insn (gen_smul<mode>3_highpart (hp, operands[1], operands[2])); emit_insn (gen_mul<mode>3 (operands[0], operands[1], operands[2])); emit_insn (gen_ashr<mode>3 (lp, operands[0], GEN_INT (BITS_PER_WORD - 1))); @@ -899,14 +899,14 @@ emit_insn (gen_muldi3 (low, operands[1], operands[2])); rtx high = gen_reg_rtx (DImode); - emit_insn (gen_<u>muldi3_highpart (high, operands[1], operands[2])); + emit_insn (gen_<su>muldi3_highpart (high, operands[1], operands[2])); emit_move_insn (gen_lowpart (DImode, operands[0]), low); emit_move_insn (gen_highpart (DImode, operands[0]), high); DONE; }) -(define_insn "<u>muldi3_highpart" +(define_insn "<su>muldi3_highpart" [(set (match_operand:DI 0 "register_operand" "=r") (truncate:DI (lshiftrt:TI @@ -961,13 +961,13 @@ { rtx temp = gen_reg_rtx (SImode); emit_insn (gen_mulsi3 (temp, operands[1], operands[2])); - emit_insn (gen_<u>mulsi3_highpart (riscv_subword (operands[0], true), + emit_insn (gen_<su>mulsi3_highpart (riscv_subword (operands[0], true), operands[1], operands[2])); emit_insn (gen_movsi (riscv_subword (operands[0], false), temp)); DONE; }) -(define_insn "<u>mulsi3_highpart" +(define_insn "<su>mulsi3_highpart" [(set (match_operand:SI 0 "register_operand" "=r") (truncate:SI (lshiftrt:DI diff --git a/gcc/config/rs6000/darwin.h b/gcc/config/rs6000/darwin.h index 6abf8e8..120b01f 100644 --- a/gcc/config/rs6000/darwin.h +++ b/gcc/config/rs6000/darwin.h @@ -203,7 +203,7 @@ /* Make both r2 and r13 available for allocation. */ #define FIXED_R2 0 -#define FIXED_R13 0 +#define FIXED_R13 TARGET_64BIT /* Base register for access to local variables of the function. */ @@ -213,6 +213,9 @@ #undef RS6000_PIC_OFFSET_TABLE_REGNUM #define RS6000_PIC_OFFSET_TABLE_REGNUM 31 +#undef FIRST_SAVED_GP_REGNO +#define FIRST_SAVED_GP_REGNO 13 + /* Darwin's stack must remain 16-byte aligned for both 32 and 64 bit ABIs. */ diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c index 7d48548..2eceb2c7 100644 --- a/gcc/config/rs6000/rs6000-call.c +++ b/gcc/config/rs6000/rs6000-call.c @@ -6223,11 +6223,19 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = { or vector type. If a non-floating point or vector type is found, or if a floating point or vector type that doesn't match a non-VOIDmode *MODEP is found, then return -1, otherwise return the count in the - sub-tree. */ + sub-tree. + + There have been some ABI snafus along the way with C++. Modify + EMPTY_BASE_SEEN to a nonzero value iff a C++ empty base class makes + an appearance; separate flag bits indicate whether or not such a + field is marked "no unique address". Modify ZERO_WIDTH_BF_SEEN + to 1 iff a C++ zero-length bitfield makes an appearance, but + in this case otherwise treat this as still being a homogeneous + aggregate. 
*/ static int rs6000_aggregate_candidate (const_tree type, machine_mode *modep, - int *empty_base_seen) + int *empty_base_seen, int *zero_width_bf_seen) { machine_mode mode; HOST_WIDE_INT size; @@ -6298,7 +6306,8 @@ rs6000_aggregate_candidate (const_tree type, machine_mode *modep, return -1; count = rs6000_aggregate_candidate (TREE_TYPE (type), modep, - empty_base_seen); + empty_base_seen, + zero_width_bf_seen); if (count == -1 || !index || !TYPE_MAX_VALUE (index) @@ -6336,6 +6345,26 @@ rs6000_aggregate_candidate (const_tree type, machine_mode *modep, if (TREE_CODE (field) != FIELD_DECL) continue; + if (DECL_FIELD_CXX_ZERO_WIDTH_BIT_FIELD (field)) + { + /* GCC 11 and earlier generated incorrect code in a rare + corner case for C++. When a RECORD_TYPE looks like a + homogeneous aggregate, except that it also contains + one or more zero-width bit fields, these earlier + compilers would incorrectly pass the fields in FPRs + or VSRs. This occurred because the front end wrongly + removed these bitfields from the RECORD_TYPE. In + GCC 12 and later, the front end flaw was corrected. + We want to diagnose this case. To do this, we pretend + that we don't see the zero-width bit fields (hence + the continue statement here), but pass back a flag + indicating what happened. The caller then diagnoses + the issue and rejects the RECORD_TYPE as a homogeneous + aggregate. */ + *zero_width_bf_seen = 1; + continue; + } + if (DECL_FIELD_ABI_IGNORED (field)) { if (lookup_attribute ("no_unique_address", @@ -6347,7 +6376,8 @@ rs6000_aggregate_candidate (const_tree type, machine_mode *modep, } sub_count = rs6000_aggregate_candidate (TREE_TYPE (field), modep, - empty_base_seen); + empty_base_seen, + zero_width_bf_seen); if (sub_count < 0) return -1; count += sub_count; @@ -6381,7 +6411,8 @@ rs6000_aggregate_candidate (const_tree type, machine_mode *modep, continue; sub_count = rs6000_aggregate_candidate (TREE_TYPE (field), modep, - empty_base_seen); + empty_base_seen, + zero_width_bf_seen); if (sub_count < 0) return -1; count = count > sub_count ? 
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 060f51a..ad86072 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -5289,9 +5289,6 @@ struct rs6000_cost_data
 static void
 rs6000_density_test (rs6000_cost_data *data)
 {
-  const int DENSITY_PCT_THRESHOLD = 85;
-  const int DENSITY_SIZE_THRESHOLD = 70;
-  const int DENSITY_PENALTY = 10;
   struct loop *loop = data->loop_info;
   basic_block *bbs = get_loop_body (loop);
   int nbbs = loop->num_nodes;
@@ -5327,26 +5324,21 @@ rs6000_density_test (rs6000_cost_data *data)
   free (bbs);
   density_pct = (vec_cost * 100) / (vec_cost + not_vec_cost);
 
-  if (density_pct > DENSITY_PCT_THRESHOLD
-      && vec_cost + not_vec_cost > DENSITY_SIZE_THRESHOLD)
+  if (density_pct > rs6000_density_pct_threshold
+      && vec_cost + not_vec_cost > rs6000_density_size_threshold)
     {
-      data->cost[vect_body] = vec_cost * (100 + DENSITY_PENALTY) / 100;
+      data->cost[vect_body] = vec_cost * (100 + rs6000_density_penalty) / 100;
       if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "density %d%%, cost %d exceeds threshold, penalizing "
-			 "loop body cost by %d%%\n", density_pct,
-			 vec_cost + not_vec_cost, DENSITY_PENALTY);
+			 "loop body cost by %u%%\n", density_pct,
+			 vec_cost + not_vec_cost, rs6000_density_penalty);
     }
 
   /* Check whether we need to penalize the body cost to account
      for excess strided or elementwise loads.  */
   if (data->extra_ctor_cost > 0)
     {
-      /* Threshold for load stmts percentage in all vectorized stmts.  */
-      const int DENSITY_LOAD_PCT_THRESHOLD = 45;
-      /* Threshold for total number of load stmts.  */
-      const int DENSITY_LOAD_NUM_THRESHOLD = 20;
-
       gcc_assert (data->nloads <= data->nstmts);
       unsigned int load_pct = (data->nloads * 100) / data->nstmts;
 
@@ -5360,8 +5352,8 @@ rs6000_density_test (rs6000_cost_data *data)
	 the loads.
	 One typical case is the innermost loop of the hotspot of
	 SPEC2017 503.bwaves_r without loop interchange.  */
-      if (data->nloads > DENSITY_LOAD_NUM_THRESHOLD
-	  && load_pct > DENSITY_LOAD_PCT_THRESHOLD)
+      if (data->nloads > (unsigned int) rs6000_density_load_num_threshold
+	  && load_pct > (unsigned int) rs6000_density_load_pct_threshold)
	{
	  data->cost[vect_body] += data->extra_ctor_cost;
	  if (dump_enabled_p ())
diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt
index c1cb9ab..9d7878f 100644
--- a/gcc/config/rs6000/rs6000.opt
+++ b/gcc/config/rs6000/rs6000.opt
@@ -639,3 +639,41 @@ Enable instructions that guard against return-oriented programming attacks.
 mprivileged
 Target Var(rs6000_privileged) Init(0)
 Generate code that will run in privileged state.
+
+-param=rs6000-density-pct-threshold=
+Target Undocumented Joined UInteger Var(rs6000_density_pct_threshold) Init(85) IntegerRange(0, 100) Param
+When costing for loop vectorization, we probably need to penalize the loop body
+cost if the existing cost model may not adequately reflect delays from
+unavailable vector resources.  We collect the cost for vectorized statements
+and non-vectorized statements separately, check the proportion of vec_cost to
+total cost of vec_cost and non vec_cost, and penalize only if the proportion
+exceeds the threshold specified by this parameter.  The default value is 85.
+
+-param=rs6000-density-size-threshold=
+Target Undocumented Joined UInteger Var(rs6000_density_size_threshold) Init(70) IntegerRange(0, 1000) Param
+Like parameter rs6000-density-pct-threshold, we also check the total sum of
+vec_cost and non vec_cost, and penalize only if the sum exceeds the threshold
+specified by this parameter.  The default value is 70.
+
+-param=rs6000-density-penalty=
+Target Undocumented Joined UInteger Var(rs6000_density_penalty) Init(10) IntegerRange(0, 1000) Param
+When both heuristics with rs6000-density-pct-threshold and
+rs6000-density-size-threshold are satisfied, we decide to penalize the loop
+body cost by the value which is specified by this parameter.  The default
+value is 10.
+
+-param=rs6000-density-load-pct-threshold=
+Target Undocumented Joined UInteger Var(rs6000_density_load_pct_threshold) Init(45) IntegerRange(0, 100) Param
+When costing for loop vectorization, we probably need to penalize the loop body
+cost by accounting for excess strided or elementwise loads.  We collect the
+numbers for general statements and load statements according to the information
+for statements to be vectorized, check the proportion of load statements, and
+penalize only if the proportion exceeds the threshold specified by this
+parameter.  The default value is 45.
+
+-param=rs6000-density-load-num-threshold=
+Target Undocumented Joined UInteger Var(rs6000_density_load_num_threshold) Init(20) IntegerRange(0, 1000) Param
+Like parameter rs6000-density-load-pct-threshold, we also check if the total
+number of load statements exceeds the threshold specified by this parameter,
+and penalize only if it's satisfied.  The default value is 20.
+
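With the rs6000.opt entries above, the density heuristics in rs6000_density_test read their thresholds from undocumented target --param knobs instead of hard-coded constants, so they can be tuned without rebuilding the compiler. The sketch below is only an illustration: the loop is a made-up example of the kind of code the density test costs, and the command line in the comment is a hypothetical invocation showing the --param syntax; the defaults (85, 70, 10, 45 and 20) match the old constants.

/* Hypothetical tuning experiment, e.g.:
     g++ -O3 -mcpu=power9 -fdump-tree-vect-details \
         --param=rs6000-density-pct-threshold=90 \
         --param=rs6000-density-penalty=15 density.cc
   The vect dump then shows whether the "density ... exceeds threshold"
   note from rs6000_density_test fired for loops like this one.  */
void
saxpy (int n, float a, const float *x, float *y)
{
  for (int i = 0; i < n; i++)
    y[i] = a * x[i] + y[i];
}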
diff --git a/gcc/config/rs6000/vxworks.h b/gcc/config/rs6000/vxworks.h
index 5facbbb..d8ecc02 100644
--- a/gcc/config/rs6000/vxworks.h
+++ b/gcc/config/rs6000/vxworks.h
@@ -147,10 +147,6 @@ along with GCC; see the file COPYING3.  If not see
 #undef FUNCTION_PROFILER
 #define FUNCTION_PROFILER(FILE,LABELNO) VXWORKS_FUNCTION_PROFILER(FILE,LABELNO)
 
-/* Initialize library function table.  */
-#undef TARGET_INIT_LIBFUNCS
-#define TARGET_INIT_LIBFUNCS rs6000_vxworks_init_libfuncs
-
 /* Nor sdata, for kernel mode.  We use this in SUBSUBTARGET_INITIALIZE_OPTIONS,
    after rs6000_rtp has been initialized.  */
 #undef SDATA_DEFAULT_SIZE
diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c
index 54dd633..e043854 100644
--- a/gcc/config/s390/s390.c
+++ b/gcc/config/s390/s390.c
@@ -6414,6 +6414,15 @@ s390_expand_insv (rtx dest, rtx op1, rtx op2, rtx src)
   if (bitsize + bitpos > GET_MODE_BITSIZE (mode))
     return false;
 
+  /* Just a move.  */
+  if (bitpos == 0
+      && bitsize == GET_MODE_BITSIZE (GET_MODE (src))
+      && mode == GET_MODE (src))
+    {
+      emit_move_insn (dest, src);
+      return true;
+    }
+
   /* Generate INSERT IMMEDIATE (IILL et al).  */
   /* (set (ze (reg)) (const_int)).  */
   if (TARGET_ZARCH
@@ -6510,6 +6519,7 @@ s390_expand_insv (rtx dest, rtx op1, rtx op2, rtx src)
       && (bitpos & 32) == ((bitpos + bitsize - 1) & 32)
       && MEM_P (src)
      && (mode == DImode || mode == SImode)
+      && mode != smode
       && register_operand (dest, mode))
     {
       /* Emit a strict_low_part pattern if possible.  */
diff --git a/gcc/config/s390/tpf.md b/gcc/config/s390/tpf.md
index 297e9d1..35b3719 100644
--- a/gcc/config/s390/tpf.md
+++ b/gcc/config/s390/tpf.md
@@ -21,7 +21,8 @@
   [(unspec_volatile [(match_operand 0 "const_int_operand" "J")
		      (match_operand 1 "const_int_operand" "J")]
		     UNSPECV_TPF_PROLOGUE)
-   (clobber (reg:DI 1))]
+   (clobber (reg:DI 1))
+   (clobber (reg:CC CC_REGNUM))]
   "TARGET_TPF_PROFILING"
   "larl\t%%r1,.+14\;tm\t%0,255\;bnz\t%1"
   [(set_attr "length" "14")])
@@ -31,7 +32,8 @@
   [(unspec_volatile [(match_operand 0 "const_int_operand" "J")
		      (match_operand 1 "const_int_operand" "J")]
		     UNSPECV_TPF_EPILOGUE)
-   (clobber (reg:DI 1))]
+   (clobber (reg:DI 1))
+   (clobber (reg:CC CC_REGNUM))]
   "TARGET_TPF_PROFILING"
   "larl\t%%r1,.+14\;tm\t%0,255\;bnz\t%1"
   [(set_attr "length" "14")])