diff options
author | Ian Lance Taylor <iant@golang.org> | 2021-02-12 11:38:19 -0800 |
---|---|---|
committer | Ian Lance Taylor <iant@golang.org> | 2021-02-12 11:38:19 -0800 |
commit | 89d7be42db00cd0953e7d4584877cf50a56ed046 (patch) | |
tree | 3a471e8ee60b7be687ab7501f70379618adcf174 /gcc/config | |
parent | 305e9d2c7815e90a29bbde1e3a7cd776861f4d7c (diff) | |
parent | 9769564e7456453e2273071d0faa5aab2554ff78 (diff) | |
download | gcc-89d7be42db00cd0953e7d4584877cf50a56ed046.zip gcc-89d7be42db00cd0953e7d4584877cf50a56ed046.tar.gz gcc-89d7be42db00cd0953e7d4584877cf50a56ed046.tar.bz2 |
Merge from trunk revision 9769564e7456453e2273071d0faa5aab2554ff78.
Diffstat (limited to 'gcc/config')
-rw-r--r-- | gcc/config/aarch64/aarch64-cost-tables.h | 18 | ||||
-rw-r--r-- | gcc/config/aarch64/aarch64-simd-builtins.def | 13 | ||||
-rw-r--r-- | gcc/config/aarch64/aarch64-simd.md | 113 | ||||
-rw-r--r-- | gcc/config/aarch64/aarch64.c | 12 | ||||
-rw-r--r-- | gcc/config/aarch64/arm_neon.h | 244 | ||||
-rw-r--r-- | gcc/config/arm/aarch-common-protos.h | 1 | ||||
-rw-r--r-- | gcc/config/arm/aarch-cost-tables.h | 18 | ||||
-rw-r--r-- | gcc/config/arm/arm.c | 21 | ||||
-rw-r--r-- | gcc/config/arm/thumb2.md | 28 | ||||
-rw-r--r-- | gcc/config/i386/i386-expand.c | 43 | ||||
-rw-r--r-- | gcc/config/i386/i386-options.c | 2 | ||||
-rw-r--r-- | gcc/config/i386/sse.md | 11 | ||||
-rw-r--r-- | gcc/config/i386/winnt.c | 4 | ||||
-rw-r--r-- | gcc/config/i386/x86-tune.def | 10 | ||||
-rw-r--r-- | gcc/config/nvptx/nvptx.c | 6 | ||||
-rw-r--r-- | gcc/config/rs6000/predicates.md | 4 |
16 files changed, 343 insertions, 205 deletions
diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h index c309f88..dd2e7e7 100644 --- a/gcc/config/aarch64/aarch64-cost-tables.h +++ b/gcc/config/aarch64/aarch64-cost-tables.h @@ -123,7 +123,8 @@ const struct cpu_cost_table qdf24xx_extra_costs = }, /* Vector */ { - COSTS_N_INSNS (1) /* alu. */ + COSTS_N_INSNS (1), /* alu. */ + COSTS_N_INSNS (4) /* mult. */ } }; @@ -227,7 +228,8 @@ const struct cpu_cost_table thunderx_extra_costs = }, /* Vector */ { - COSTS_N_INSNS (1) /* Alu. */ + COSTS_N_INSNS (1), /* Alu. */ + COSTS_N_INSNS (4) /* mult. */ } }; @@ -330,7 +332,8 @@ const struct cpu_cost_table thunderx2t99_extra_costs = }, /* Vector */ { - COSTS_N_INSNS (1) /* Alu. */ + COSTS_N_INSNS (1), /* Alu. */ + COSTS_N_INSNS (4) /* Mult. */ } }; @@ -433,7 +436,8 @@ const struct cpu_cost_table thunderx3t110_extra_costs = }, /* Vector */ { - COSTS_N_INSNS (1) /* Alu. */ + COSTS_N_INSNS (1), /* Alu. */ + COSTS_N_INSNS (4) /* Mult. */ } }; @@ -537,7 +541,8 @@ const struct cpu_cost_table tsv110_extra_costs = }, /* Vector */ { - COSTS_N_INSNS (1) /* alu. */ + COSTS_N_INSNS (1), /* alu. */ + COSTS_N_INSNS (4) /* mult. */ } }; @@ -640,7 +645,8 @@ const struct cpu_cost_table a64fx_extra_costs = }, /* Vector */ { - COSTS_N_INSNS (1) /* alu. */ + COSTS_N_INSNS (1), /* alu. */ + COSTS_N_INSNS (4) /* mult. */ } }; diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index b787cb9..b885bd5 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -55,6 +55,11 @@ BUILTIN_VS (UNOP, ctz, 2, NONE) BUILTIN_VB (UNOP, popcount, 2, NONE) + /* Implemented by aarch64_get_low<mode>. */ + BUILTIN_VQMOV (UNOP, get_low, 0, AUTO_FP) + /* Implemented by aarch64_get_high<mode>. */ + BUILTIN_VQMOV (UNOP, get_high, 0, AUTO_FP) + /* Implemented by aarch64_<sur>q<r>shl<mode>. */ BUILTIN_VSDQ_I (BINOP, sqshl, 0, NONE) BUILTIN_VSDQ_I (BINOP_UUS, uqshl, 0, NONE) @@ -300,6 +305,14 @@ BUILTIN_VD_HSI (BINOP, smull_n, 0, NONE) BUILTIN_VD_HSI (BINOPU, umull_n, 0, NONE) + BUILTIN_VQ_HSI (BINOP, smull_hi_n, 0, NONE) + BUILTIN_VQ_HSI (BINOPU, umull_hi_n, 0, NONE) + + BUILTIN_VQ_HSI (TERNOP_LANE, smull_hi_lane, 0, NONE) + BUILTIN_VQ_HSI (TERNOP_LANE, smull_hi_laneq, 0, NONE) + BUILTIN_VQ_HSI (TERNOPU_LANE, umull_hi_lane, 0, NONE) + BUILTIN_VQ_HSI (TERNOPU_LANE, umull_hi_laneq, 0, NONE) + BUILTIN_VD_HSI (TERNOP_LANE, vec_smult_lane_, 0, NONE) BUILTIN_VD_HSI (QUADOP_LANE, vec_smlal_lane_, 0, NONE) BUILTIN_VD_HSI (TERNOP_LANE, vec_smult_laneq_, 0, NONE) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 393bab1..71aa77d 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -297,6 +297,28 @@ "TARGET_SIMD" ) +(define_expand "aarch64_get_low<mode>" + [(match_operand:<VHALF> 0 "register_operand") + (match_operand:VQMOV 1 "register_operand")] + "TARGET_SIMD" + { + rtx lo = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false); + emit_insn (gen_aarch64_get_half<mode> (operands[0], operands[1], lo)); + DONE; + } +) + +(define_expand "aarch64_get_high<mode>" + [(match_operand:<VHALF> 0 "register_operand") + (match_operand:VQMOV 1 "register_operand")] + "TARGET_SIMD" + { + rtx hi = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true); + emit_insn (gen_aarch64_get_half<mode> (operands[0], operands[1], hi)); + DONE; + } +) + (define_insn_and_split "aarch64_simd_mov_from_<mode>low" [(set (match_operand:<VHALF> 0 "register_operand" "=w,?r") (vec_select:<VHALF> @@ -2253,6 +2275,70 @@ [(set_attr "type" "neon_mul_<Vetype>_scalar_long")] ) +(define_insn "aarch64_<su>mull_hi_lane<mode>_insn" + [(set (match_operand:<VWIDE> 0 "register_operand" "=w") + (mult:<VWIDE> + (ANY_EXTEND:<VWIDE> (vec_select:<VHALF> + (match_operand:VQ_HSI 1 "register_operand" "w") + (match_operand:VQ_HSI 2 "vect_par_cnst_hi_half" ""))) + (ANY_EXTEND:<VWIDE> (vec_duplicate:<VHALF> + (vec_select:<VEL> + (match_operand:<VCOND> 3 "register_operand" "<vwx>") + (parallel [(match_operand:SI 4 "immediate_operand" "i")]))))))] + "TARGET_SIMD" + { + operands[4] = aarch64_endian_lane_rtx (<VCOND>mode, INTVAL (operands[4])); + return "<su>mull2\\t%0.<Vwtype>, %1.<Vtype>, %3.<Vetype>[%4]"; + } + [(set_attr "type" "neon_mul_<Vetype>_scalar_long")] +) + +(define_expand "aarch64_<su>mull_hi_lane<mode>" + [(match_operand:<VWIDE> 0 "register_operand") + (ANY_EXTEND:<VWIDE>(match_operand:VQ_HSI 1 "register_operand")) + (match_operand:<VCOND> 2 "register_operand") + (match_operand:SI 3 "immediate_operand")] + "TARGET_SIMD" +{ + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true); + emit_insn (gen_aarch64_<su>mull_hi_lane<mode>_insn (operands[0], + operands[1], p, operands[2], operands[3])); + DONE; +} +) + +(define_insn "aarch64_<su>mull_hi_laneq<mode>_insn" + [(set (match_operand:<VWIDE> 0 "register_operand" "=w") + (mult:<VWIDE> + (ANY_EXTEND:<VWIDE> (vec_select:<VHALF> + (match_operand:VQ_HSI 1 "register_operand" "w") + (match_operand:VQ_HSI 2 "vect_par_cnst_hi_half" ""))) + (ANY_EXTEND:<VWIDE> (vec_duplicate:<VHALF> + (vec_select:<VEL> + (match_operand:<VCONQ> 3 "register_operand" "<vwx>") + (parallel [(match_operand:SI 4 "immediate_operand" "i")]))))))] + "TARGET_SIMD" + { + operands[4] = aarch64_endian_lane_rtx (<VCONQ>mode, INTVAL (operands[4])); + return "<su>mull2\\t%0.<Vwtype>, %1.<Vtype>, %3.<Vetype>[%4]"; + } + [(set_attr "type" "neon_mul_<Vetype>_scalar_long")] +) + +(define_expand "aarch64_<su>mull_hi_laneq<mode>" + [(match_operand:<VWIDE> 0 "register_operand") + (ANY_EXTEND:<VWIDE>(match_operand:VQ_HSI 1 "register_operand")) + (match_operand:<VCONQ> 2 "register_operand") + (match_operand:SI 3 "immediate_operand")] + "TARGET_SIMD" +{ + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true); + emit_insn (gen_aarch64_<su>mull_hi_laneq<mode>_insn (operands[0], + operands[1], p, operands[2], operands[3])); + DONE; +} +) + (define_insn "aarch64_<su>mull_n<mode>" [(set (match_operand:<VWIDE> 0 "register_operand" "=w") (mult:<VWIDE> @@ -2266,6 +2352,33 @@ [(set_attr "type" "neon_mul_<Vetype>_scalar_long")] ) +(define_insn "aarch64_<su>mull_hi_n<mode>_insn" + [(set (match_operand:<VWIDE> 0 "register_operand" "=w") + (mult:<VWIDE> + (ANY_EXTEND:<VWIDE> (vec_select:<VHALF> + (match_operand:VQ_HSI 1 "register_operand" "w") + (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" ""))) + (ANY_EXTEND:<VWIDE> + (vec_duplicate:<VCOND> + (match_operand:<VEL> 2 "register_operand" "<h_con>")))))] + "TARGET_SIMD" + "<su>mull2\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[0]" + [(set_attr "type" "neon_mul_<Vetype>_scalar_long")] +) + +(define_expand "aarch64_<su>mull_hi_n<mode>" + [(match_operand:<VWIDE> 0 "register_operand") + (ANY_EXTEND:<VWIDE> (match_operand:VQ_HSI 1 "register_operand")) + (match_operand:<VEL> 2 "register_operand")] + "TARGET_SIMD" + { + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true); + emit_insn (gen_aarch64_<su>mull_hi_n<mode>_insn (operands[0], operands[1], + operands[2], p)); + DONE; + } +) + ;; vmlal_lane_s16 intrinsics (define_insn "aarch64_vec_<su>mlal_lane<Qlane>" [(set (match_operand:<VWIDE> 0 "register_operand" "=w") diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index b6192e5..146ed8c 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -11568,7 +11568,6 @@ aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed) if (VECTOR_MODE_P (mode)) { unsigned int vec_flags = aarch64_classify_vector_mode (mode); - mode = GET_MODE_INNER (mode); if (vec_flags & VEC_ADVSIMD) { /* The by-element versions of the instruction have the same costs as @@ -11582,6 +11581,17 @@ aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed) else if (GET_CODE (op1) == VEC_DUPLICATE) op1 = XEXP (op1, 0); } + cost += rtx_cost (op0, mode, MULT, 0, speed); + cost += rtx_cost (op1, mode, MULT, 1, speed); + if (speed) + { + if (GET_CODE (x) == MULT) + cost += extra_cost->vect.mult; + /* This is to catch the SSRA costing currently flowing here. */ + else + cost += extra_cost->vect.alu; + } + return cost; } /* Integer multiply/fma. */ diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index d50bd65..baa30bd 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -6302,216 +6302,203 @@ vsetq_lane_u64 (uint64_t __elem, uint64x2_t __vec, const int __index) return __aarch64_vset_lane_any (__elem, __vec, __index); } -#define __GET_LOW(__TYPE) \ - uint64x2_t tmp = vreinterpretq_u64_##__TYPE (__a); \ - uint64x1_t lo = vcreate_u64 (vgetq_lane_u64 (tmp, 0)); \ - return vreinterpret_##__TYPE##_u64 (lo); - __extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_low_f16 (float16x8_t __a) { - __GET_LOW (f16); + return __builtin_aarch64_get_lowv8hf (__a); } __extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_low_f32 (float32x4_t __a) { - __GET_LOW (f32); + return __builtin_aarch64_get_lowv4sf (__a); } __extension__ extern __inline float64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_low_f64 (float64x2_t __a) { - return (float64x1_t) {vgetq_lane_f64 (__a, 0)}; + return (float64x1_t) {__builtin_aarch64_get_lowv2df (__a)}; } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_low_p8 (poly8x16_t __a) { - __GET_LOW (p8); + return (poly8x8_t) __builtin_aarch64_get_lowv16qi ((int8x16_t) __a); } __extension__ extern __inline poly16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_low_p16 (poly16x8_t __a) { - __GET_LOW (p16); + return (poly16x4_t) __builtin_aarch64_get_lowv8hi ((int16x8_t) __a); } __extension__ extern __inline poly64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_low_p64 (poly64x2_t __a) { - __GET_LOW (p64); + return (poly64x1_t) __builtin_aarch64_get_lowv2di ((int64x2_t) __a); } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_low_s8 (int8x16_t __a) { - __GET_LOW (s8); + return __builtin_aarch64_get_lowv16qi (__a); } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_low_s16 (int16x8_t __a) { - __GET_LOW (s16); + return __builtin_aarch64_get_lowv8hi (__a); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_low_s32 (int32x4_t __a) { - __GET_LOW (s32); + return __builtin_aarch64_get_lowv4si (__a); } __extension__ extern __inline int64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_low_s64 (int64x2_t __a) { - __GET_LOW (s64); + return (int64x1_t) {__builtin_aarch64_get_lowv2di (__a)}; } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_low_u8 (uint8x16_t __a) { - __GET_LOW (u8); + return (uint8x8_t) __builtin_aarch64_get_lowv16qi ((int8x16_t) __a); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_low_u16 (uint16x8_t __a) { - __GET_LOW (u16); + return (uint16x4_t) __builtin_aarch64_get_lowv8hi ((int16x8_t) __a); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_low_u32 (uint32x4_t __a) { - __GET_LOW (u32); + return (uint32x2_t) __builtin_aarch64_get_lowv4si ((int32x4_t) __a); } __extension__ extern __inline uint64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_low_u64 (uint64x2_t __a) { - return vcreate_u64 (vgetq_lane_u64 (__a, 0)); + return (uint64x1_t) {__builtin_aarch64_get_lowv2di ((int64x2_t) __a)}; } -#undef __GET_LOW - -#define __GET_HIGH(__TYPE) \ - uint64x2_t tmp = vreinterpretq_u64_##__TYPE (__a); \ - uint64x1_t hi = vcreate_u64 (vgetq_lane_u64 (tmp, 1)); \ - return vreinterpret_##__TYPE##_u64 (hi); - __extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_high_f16 (float16x8_t __a) { - __GET_HIGH (f16); + return __builtin_aarch64_get_highv8hf (__a); } __extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_high_f32 (float32x4_t __a) { - __GET_HIGH (f32); + return __builtin_aarch64_get_highv4sf (__a); } __extension__ extern __inline float64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_high_f64 (float64x2_t __a) { - __GET_HIGH (f64); + return (float64x1_t) {__builtin_aarch64_get_highv2df (__a)}; } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_high_p8 (poly8x16_t __a) { - __GET_HIGH (p8); + return (poly8x8_t) __builtin_aarch64_get_highv16qi ((int8x16_t) __a); } __extension__ extern __inline poly16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_high_p16 (poly16x8_t __a) { - __GET_HIGH (p16); + return (poly16x4_t) __builtin_aarch64_get_highv8hi ((int16x8_t) __a); } __extension__ extern __inline poly64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_high_p64 (poly64x2_t __a) { - __GET_HIGH (p64); + return (poly64x1_t) __builtin_aarch64_get_highv2di ((int64x2_t) __a); } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_high_s8 (int8x16_t __a) { - __GET_HIGH (s8); + return __builtin_aarch64_get_highv16qi (__a); } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_high_s16 (int16x8_t __a) { - __GET_HIGH (s16); + return __builtin_aarch64_get_highv8hi (__a); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_high_s32 (int32x4_t __a) { - __GET_HIGH (s32); + return __builtin_aarch64_get_highv4si (__a); } __extension__ extern __inline int64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_high_s64 (int64x2_t __a) { - __GET_HIGH (s64); + return (int64x1_t) {__builtin_aarch64_get_highv2di (__a)}; } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_high_u8 (uint8x16_t __a) { - __GET_HIGH (u8); + return (uint8x8_t) __builtin_aarch64_get_highv16qi ((int8x16_t) __a); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_high_u16 (uint16x8_t __a) { - __GET_HIGH (u16); + return (uint16x4_t) __builtin_aarch64_get_highv8hi ((int16x8_t) __a); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_high_u32 (uint32x4_t __a) { - __GET_HIGH (u32); + return (uint32x2_t) __builtin_aarch64_get_highv4si ((int32x4_t) __a); } -#undef __GET_HIGH - __extension__ extern __inline uint64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_high_u64 (uint64x2_t __a) { - return vcreate_u64 (vgetq_lane_u64 (__a, 1)); + return (uint64x1_t) {__builtin_aarch64_get_highv2di ((int64x2_t) __a)}; } + __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcombine_s8 (int8x8_t __a, int8x8_t __b) @@ -8167,156 +8154,89 @@ vshrn_n_u64 (uint64x2_t __a, const int __b) { return (uint32x2_t)__builtin_aarch64_shrnv2di ((int64x2_t)__a, __b); } -#define vmull_high_lane_s16(a, b, c) \ - __extension__ \ - ({ \ - int16x4_t b_ = (b); \ - int16x8_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) -#define vmull_high_lane_s32(a, b, c) \ - __extension__ \ - ({ \ - int32x2_t b_ = (b); \ - int32x4_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_high_lane_s16 (int16x8_t __a, int16x4_t __v, const int __lane) +{ + return __builtin_aarch64_smull_hi_lanev8hi (__a, __v, __lane); +} -#define vmull_high_lane_u16(a, b, c) \ - __extension__ \ - ({ \ - uint16x4_t b_ = (b); \ - uint16x8_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_high_lane_s32 (int32x4_t __a, int32x2_t __v, const int __lane) +{ + return __builtin_aarch64_smull_hi_lanev4si (__a, __v, __lane); +} -#define vmull_high_lane_u32(a, b, c) \ - __extension__ \ - ({ \ - uint32x2_t b_ = (b); \ - uint32x4_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_high_lane_u16 (uint16x8_t __a, uint16x4_t __v, const int __lane) +{ + return __builtin_aarch64_umull_hi_lanev8hi_uuus (__a, __v, __lane); +} -#define vmull_high_laneq_s16(a, b, c) \ - __extension__ \ - ({ \ - int16x8_t b_ = (b); \ - int16x8_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_high_lane_u32 (uint32x4_t __a, uint32x2_t __v, const int __lane) +{ + return __builtin_aarch64_umull_hi_lanev4si_uuus (__a, __v, __lane); +} -#define vmull_high_laneq_s32(a, b, c) \ - __extension__ \ - ({ \ - int32x4_t b_ = (b); \ - int32x4_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_high_laneq_s16 (int16x8_t __a, int16x8_t __v, const int __lane) +{ + return __builtin_aarch64_smull_hi_laneqv8hi (__a, __v, __lane); +} -#define vmull_high_laneq_u16(a, b, c) \ - __extension__ \ - ({ \ - uint16x8_t b_ = (b); \ - uint16x8_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_high_laneq_s32 (int32x4_t __a, int32x4_t __v, const int __lane) +{ + return __builtin_aarch64_smull_hi_laneqv4si (__a, __v, __lane); +} -#define vmull_high_laneq_u32(a, b, c) \ - __extension__ \ - ({ \ - uint32x4_t b_ = (b); \ - uint32x4_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_high_laneq_u16 (uint16x8_t __a, uint16x8_t __v, const int __lane) +{ + return __builtin_aarch64_umull_hi_laneqv8hi_uuus (__a, __v, __lane); +} + +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_high_laneq_u32 (uint32x4_t __a, uint32x4_t __v, const int __lane) +{ + return __builtin_aarch64_umull_hi_laneqv4si_uuus (__a, __v, __lane); +} __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmull_high_n_s16 (int16x8_t __a, int16_t __b) { - int32x4_t __result; - __asm__ ("smull2 %0.4s,%1.8h,%2.h[0]" - : "=w"(__result) - : "w"(__a), "x"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_smull_hi_nv8hi (__a, __b); } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmull_high_n_s32 (int32x4_t __a, int32_t __b) { - int64x2_t __result; - __asm__ ("smull2 %0.2d,%1.4s,%2.s[0]" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_smull_hi_nv4si (__a, __b); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmull_high_n_u16 (uint16x8_t __a, uint16_t __b) { - uint32x4_t __result; - __asm__ ("umull2 %0.4s,%1.8h,%2.h[0]" - : "=w"(__result) - : "w"(__a), "x"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_umull_hi_nv8hi_uuu (__a, __b); } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmull_high_n_u32 (uint32x4_t __a, uint32_t __b) { - uint64x2_t __result; - __asm__ ("umull2 %0.2d,%1.4s,%2.s[0]" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_umull_hi_nv4si_uuu (__a, __b); } __extension__ extern __inline poly16x8_t diff --git a/gcc/config/arm/aarch-common-protos.h b/gcc/config/arm/aarch-common-protos.h index 251de3d..7a9cf3d 100644 --- a/gcc/config/arm/aarch-common-protos.h +++ b/gcc/config/arm/aarch-common-protos.h @@ -132,6 +132,7 @@ struct fp_cost_table struct vector_cost_table { const int alu; + const int mult; }; struct cpu_cost_table diff --git a/gcc/config/arm/aarch-cost-tables.h b/gcc/config/arm/aarch-cost-tables.h index d4baee4..25ff702 100644 --- a/gcc/config/arm/aarch-cost-tables.h +++ b/gcc/config/arm/aarch-cost-tables.h @@ -121,7 +121,8 @@ const struct cpu_cost_table generic_extra_costs = }, /* Vector */ { - COSTS_N_INSNS (1) /* alu. */ + COSTS_N_INSNS (1), /* alu. */ + COSTS_N_INSNS (4) /* mult. */ } }; @@ -224,7 +225,8 @@ const struct cpu_cost_table cortexa53_extra_costs = }, /* Vector */ { - COSTS_N_INSNS (1) /* alu. */ + COSTS_N_INSNS (1), /* alu. */ + COSTS_N_INSNS (4) /* mult. */ } }; @@ -327,7 +329,8 @@ const struct cpu_cost_table cortexa57_extra_costs = }, /* Vector */ { - COSTS_N_INSNS (1) /* alu. */ + COSTS_N_INSNS (1), /* alu. */ + COSTS_N_INSNS (4) /* mult. */ } }; @@ -430,7 +433,8 @@ const struct cpu_cost_table cortexa76_extra_costs = }, /* Vector */ { - COSTS_N_INSNS (1) /* alu. */ + COSTS_N_INSNS (1), /* alu. */ + COSTS_N_INSNS (4) /* mult. */ } }; @@ -533,7 +537,8 @@ const struct cpu_cost_table exynosm1_extra_costs = }, /* Vector */ { - COSTS_N_INSNS (0) /* alu. */ + COSTS_N_INSNS (0), /* alu. */ + COSTS_N_INSNS (4) /* mult. */ } }; @@ -636,7 +641,8 @@ const struct cpu_cost_table xgene1_extra_costs = }, /* Vector */ { - COSTS_N_INSNS (2) /* alu. */ + COSTS_N_INSNS (2), /* alu. */ + COSTS_N_INSNS (8) /* mult. */ } }; diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index e22396d..d254f41 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -1192,7 +1192,8 @@ const struct cpu_cost_table cortexa9_extra_costs = }, /* Vector */ { - COSTS_N_INSNS (1) /* alu. */ + COSTS_N_INSNS (1), /* alu. */ + COSTS_N_INSNS (4) /* mult. */ } }; @@ -1295,7 +1296,8 @@ const struct cpu_cost_table cortexa8_extra_costs = }, /* Vector */ { - COSTS_N_INSNS (1) /* alu. */ + COSTS_N_INSNS (1), /* alu. */ + COSTS_N_INSNS (4) /* mult. */ } }; @@ -1399,7 +1401,8 @@ const struct cpu_cost_table cortexa5_extra_costs = }, /* Vector */ { - COSTS_N_INSNS (1) /* alu. */ + COSTS_N_INSNS (1), /* alu. */ + COSTS_N_INSNS (4) /* mult. */ } }; @@ -1504,7 +1507,8 @@ const struct cpu_cost_table cortexa7_extra_costs = }, /* Vector */ { - COSTS_N_INSNS (1) /* alu. */ + COSTS_N_INSNS (1), /* alu. */ + COSTS_N_INSNS (4) /* mult. */ } }; @@ -1607,7 +1611,8 @@ const struct cpu_cost_table cortexa12_extra_costs = }, /* Vector */ { - COSTS_N_INSNS (1) /* alu. */ + COSTS_N_INSNS (1), /* alu. */ + COSTS_N_INSNS (4) /* mult. */ } }; @@ -1710,7 +1715,8 @@ const struct cpu_cost_table cortexa15_extra_costs = }, /* Vector */ { - COSTS_N_INSNS (1) /* alu. */ + COSTS_N_INSNS (1), /* alu. */ + COSTS_N_INSNS (4) /* mult. */ } }; @@ -1813,7 +1819,8 @@ const struct cpu_cost_table v7m_extra_costs = }, /* Vector */ { - COSTS_N_INSNS (1) /* alu. */ + COSTS_N_INSNS (1), /* alu. */ + COSTS_N_INSNS (4) /* mult. */ } }; diff --git a/gcc/config/arm/thumb2.md b/gcc/config/arm/thumb2.md index bd53bf3..d7fd96c 100644 --- a/gcc/config/arm/thumb2.md +++ b/gcc/config/arm/thumb2.md @@ -1711,15 +1711,27 @@ ;; Originally expanded by 'doloop_end'. (define_insn "*doloop_end_internal" - [(parallel [(set (pc) - (if_then_else - (ne (reg:SI LR_REGNUM) (const_int 1)) - (label_ref (match_operand 0 "" "")) - (pc))) - (set (reg:SI LR_REGNUM) - (plus:SI (reg:SI LR_REGNUM) (const_int -1)))])] + [(set (pc) + (if_then_else + (ne (reg:SI LR_REGNUM) (const_int 1)) + (label_ref (match_operand 0 "" "")) + (pc))) + (set (reg:SI LR_REGNUM) + (plus:SI (reg:SI LR_REGNUM) (const_int -1))) + (clobber (reg:CC CC_REGNUM))] "TARGET_32BIT && TARGET_HAVE_LOB" - "le\t%|lr, %l0") + { + if (get_attr_length (insn) == 4) + return "le\t%|lr, %l0"; + else + return "subs\t%|lr, #1;bne\t%l0"; + } + [(set (attr "length") + (if_then_else + (ltu (minus (pc) (match_dup 0)) (const_int 1024)) + (const_int 4) + (const_int 6))) + (set_attr "type" "branch")]) (define_expand "doloop_begin" [(match_operand 0 "" "") diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index d64b4ac..02d3142 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -3469,6 +3469,33 @@ ix86_valid_mask_cmp_mode (machine_mode mode) return vector_size == 64 || TARGET_AVX512VL; } +/* Return true if integer mask comparison should be used. */ +static bool +ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode, + rtx op_true, rtx op_false) +{ + if (GET_MODE_SIZE (mode) == 64) + return true; + + /* When op_true is NULL, op_false must be NULL, or vice versa. */ + gcc_assert (!op_true == !op_false); + + /* When op_true/op_false is NULL or cmp_mode is not valid mask cmp mode, + vector dest is required. */ + if (!op_true || !ix86_valid_mask_cmp_mode (cmp_mode)) + return false; + + /* Exclude those that could be optimized in ix86_expand_sse_movcc. */ + if (op_false == CONST0_RTX (mode) + || op_true == CONST0_RTX (mode) + || (INTEGRAL_MODE_P (mode) + && (op_true == CONSTM1_RTX (mode) + || op_false == CONSTM1_RTX (mode)))) + return false; + + return true; +} + /* Expand an SSE comparison. Return the register with the result. */ static rtx @@ -3485,7 +3512,7 @@ ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1, bool maskcmp = false; rtx x; - if (ix86_valid_mask_cmp_mode (cmp_ops_mode)) + if (ix86_use_mask_cmp_p (mode, cmp_ops_mode, op_true, op_false)) { unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode); maskcmp = true; @@ -3517,7 +3544,7 @@ ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1, x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1); - if (cmp_mode != mode && !maskcmp) + if (cmp_mode != mode) { x = force_reg (cmp_ops_mode, x); convert_move (dest, x, false); @@ -3544,9 +3571,6 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) return; } - /* In AVX512F the result of comparison is an integer mask. */ - bool maskcmp = mode != cmpmode && ix86_valid_mask_cmp_mode (mode); - rtx t2, t3, x; /* If we have an integer mask and FP value then we need @@ -3557,8 +3581,11 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) cmp = gen_rtx_SUBREG (mode, cmp, 0); } - if (maskcmp) + /* In AVX512F the result of comparison is an integer mask. */ + if (mode != cmpmode + && GET_MODE_CLASS (cmpmode) == MODE_INT) { + gcc_assert (ix86_valid_mask_cmp_mode (mode)); /* Using vector move with mask register. */ cmp = force_reg (cmpmode, cmp); /* Optimize for mask zero. */ @@ -4016,7 +4043,7 @@ ix86_expand_fp_vec_cmp (rtx operands[]) } else cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3], - operands[1], operands[2]); + NULL, NULL); if (operands[0] != cmp) emit_move_insn (operands[0], cmp); @@ -4041,7 +4068,7 @@ ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1, ; /* AVX512F supports all of the comparsions on all 128/256/512-bit vector int types. */ - else if (ix86_valid_mask_cmp_mode (mode)) + else if (ix86_use_mask_cmp_p (data_mode, mode, op_true, op_false)) ; else { diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c index a70f6ed..cdeabbf 100644 --- a/gcc/config/i386/i386-options.c +++ b/gcc/config/i386/i386-options.c @@ -98,6 +98,8 @@ along with GCC; see the file COPYING3. If not see #endif /* Processor feature/optimization bitmasks. */ +#define m_NONE HOST_WIDE_INT_0U +#define m_ALL (~HOST_WIDE_INT_0U) #define m_386 (HOST_WIDE_INT_1U<<PROCESSOR_I386) #define m_486 (HOST_WIDE_INT_1U<<PROCESSOR_I486) #define m_PENT (HOST_WIDE_INT_1U<<PROCESSOR_PENTIUM) diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 369a00d..db5be59 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -6356,6 +6356,7 @@ (match_operand:V2SF 1 "register_operand")))] "TARGET_AVX512DQ && TARGET_AVX512VL" { + operands[1] = force_reg (V2SFmode, operands[1]); operands[1] = simplify_gen_subreg (V4SFmode, operands[1], V2SFmode, 0); emit_insn (gen_avx512dq_fix<fixunssuffix>_truncv2sfv2di2 (operands[0], operands[1])); @@ -18013,6 +18014,7 @@ { if (!MEM_P (operands[1])) { + operands[1] = force_reg (V8QImode, operands[1]); operands[1] = simplify_gen_subreg (V16QImode, operands[1], V8QImode, 0); emit_insn (gen_sse4_1_<code>v8qiv8hi2 (operands[0], operands[1])); DONE; @@ -18090,6 +18092,7 @@ { if (!MEM_P (operands[1])) { + operands[1] = force_reg (V8QImode, operands[1]); operands[1] = simplify_gen_subreg (V16QImode, operands[1], V8QImode, 0); emit_insn (gen_avx2_<code>v8qiv8si2 (operands[0], operands[1])); DONE; @@ -18153,6 +18156,7 @@ { if (!MEM_P (operands[1])) { + operands[1] = force_reg (V4QImode, operands[1]); operands[1] = simplify_gen_subreg (V16QImode, operands[1], V4QImode, 0); emit_insn (gen_sse4_1_<code>v4qiv4si2 (operands[0], operands[1])); DONE; @@ -18279,6 +18283,7 @@ { if (!MEM_P (operands[1])) { + operands[1] = force_reg (V4HImode, operands[1]); operands[1] = simplify_gen_subreg (V8HImode, operands[1], V4HImode, 0); emit_insn (gen_sse4_1_<code>v4hiv4si2 (operands[0], operands[1])); DONE; @@ -18366,6 +18371,7 @@ { if (!MEM_P (operands[1])) { + operands[1] = force_reg (V8QImode, operands[1]); operands[1] = simplify_gen_subreg (V16QImode, operands[1], V8QImode, 0); emit_insn (gen_avx512f_<code>v8qiv8di2 (operands[0], operands[1])); DONE; @@ -18427,6 +18433,7 @@ { if (!MEM_P (operands[1])) { + operands[1] = force_reg (V8QImode, operands[1]); operands[1] = simplify_gen_subreg (V16QImode, operands[1], V8QImode, 0); emit_insn (gen_avx2_<code>v4qiv4di2 (operands[0], operands[1])); DONE; @@ -18453,6 +18460,7 @@ (match_operand:V2QI 1 "register_operand")))] "TARGET_SSE4_1" { + operands[1] = force_reg (V2QImode, operands[1]); operands[1] = simplify_gen_subreg (V16QImode, operands[1], V2QImode, 0); emit_insn (gen_sse4_1_<code>v2qiv2di2 (operands[0], operands[1])); DONE; @@ -18525,6 +18533,7 @@ { if (!MEM_P (operands[1])) { + operands[1] = force_reg (V4HImode, operands[1]); operands[1] = simplify_gen_subreg (V8HImode, operands[1], V4HImode, 0); emit_insn (gen_avx2_<code>v4hiv4di2 (operands[0], operands[1])); DONE; @@ -18586,6 +18595,7 @@ { if (!MEM_P (operands[1])) { + operands[1] = force_reg (V2HImode, operands[1]); operands[1] = simplify_gen_subreg (V8HImode, operands[1], V2HImode, 0); emit_insn (gen_sse4_1_<code>v2hiv2di2 (operands[0], operands[1])); DONE; @@ -18737,6 +18747,7 @@ { if (!MEM_P (operands[1])) { + operands[1] = force_reg (V2SImode, operands[1]); operands[1] = simplify_gen_subreg (V4SImode, operands[1], V2SImode, 0); emit_insn (gen_sse4_1_<code>v2siv2di2 (operands[0], operands[1])); DONE; diff --git a/gcc/config/i386/winnt.c b/gcc/config/i386/winnt.c index 962c88e..adc3f36 100644 --- a/gcc/config/i386/winnt.c +++ b/gcc/config/i386/winnt.c @@ -1231,6 +1231,10 @@ i386_pe_seh_unwind_emit (FILE *asm_out_file, rtx_insn *insn) seh = cfun->machine->seh; if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_SWITCH_TEXT_SECTIONS) { + /* See ix86_seh_fixup_eh_fallthru for the rationale. */ + rtx_insn *prev = prev_active_insn (insn); + if (prev && !insn_nothrow_p (prev)) + fputs ("\tnop\n", asm_out_file); fputs ("\t.seh_endproc\n", asm_out_file); seh->in_cold_section = true; return; diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index 7ace8da..140ccb3 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -453,12 +453,12 @@ DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | m_ZNVE /* X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL: if false, unaligned loads are split. */ DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL, "256_unaligned_load_optimal", - ~(m_NEHALEM | m_SANDYBRIDGE | m_GENERIC)) + ~(m_NEHALEM | m_SANDYBRIDGE)) /* X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL: if false, unaligned stores are split. */ DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_store_optimal", - ~(m_NEHALEM | m_SANDYBRIDGE | m_BDVER | m_ZNVER1 | m_GENERIC)) + ~(m_NEHALEM | m_SANDYBRIDGE | m_BDVER | m_ZNVER1)) /* X86_TUNE_AVX256_SPLIT_REGS: if true, AVX256 ops are split into two AVX128 ops. */ DEF_TUNE (X86_TUNE_AVX256_SPLIT_REGS, "avx256_split_regs",m_BDVER | m_BTVER2 @@ -580,15 +580,15 @@ DEF_TUNE (X86_TUNE_AVOID_VECTOR_DECODE, "avoid_vector_decode", on simulation result. But after P4 was made, no performance benefit was observed with branch hints. It also increases the code size. As a result, icc never generates branch hints. */ -DEF_TUNE (X86_TUNE_BRANCH_PREDICTION_HINTS, "branch_prediction_hints", 0U) +DEF_TUNE (X86_TUNE_BRANCH_PREDICTION_HINTS, "branch_prediction_hints", m_NONE) /* X86_TUNE_QIMODE_MATH: Enable use of 8bit arithmetic. */ -DEF_TUNE (X86_TUNE_QIMODE_MATH, "qimode_math", ~0U) +DEF_TUNE (X86_TUNE_QIMODE_MATH, "qimode_math", m_ALL) /* X86_TUNE_PROMOTE_QI_REGS: This enables generic code that promotes all 8bit arithmetic to 32bit via PROMOTE_MODE macro. This code generation scheme is usually used for RISC targets. */ -DEF_TUNE (X86_TUNE_PROMOTE_QI_REGS, "promote_qi_regs", 0U) +DEF_TUNE (X86_TUNE_PROMOTE_QI_REGS, "promote_qi_regs", m_NONE) /* X86_TUNE_EMIT_VZEROUPPER: This enables vzeroupper instruction insertion before a transfer of control flow out of the function. */ diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index f08b679..794c5a6 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -74,6 +74,7 @@ #include "cfgloop.h" #include "fold-const.h" #include "intl.h" +#include "opts.h" /* This file should be included last. */ #include "target-def.h" @@ -219,7 +220,10 @@ nvptx_option_override (void) flag_no_common = 1; /* The patch area requires nops, which we don't have. */ - if (function_entry_patch_area_size > 0) + HOST_WIDE_INT patch_area_size, patch_area_entry; + parse_and_check_patch_area (flag_patchable_function_entry, false, + &patch_area_size, &patch_area_entry); + if (patch_area_size > 0) sorry ("not generating patch area, nops not supported"); /* Assumes that it will see only hard registers. */ diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md index 76328ec..bd26c62 100644 --- a/gcc/config/rs6000/predicates.md +++ b/gcc/config/rs6000/predicates.md @@ -1156,7 +1156,9 @@ ;; Return 1 if this operand is valid for a MMA assemble accumulator insn. (define_special_predicate "mma_assemble_input_operand" (match_test "(mode == V16QImode - && (vsx_register_operand (op, mode) || MEM_P (op)))")) + && (vsx_register_operand (op, mode) + || (MEM_P (op) + && quad_address_p (XEXP (op, 0), mode, false))))")) ;; Return 1 if this operand is valid for an MMA disassemble insn. (define_predicate "mma_disassemble_output_operand" |