aboutsummaryrefslogtreecommitdiff
path: root/gcc/config
diff options
context:
space:
mode:
authorIan Lance Taylor <iant@golang.org>2021-02-12 11:38:19 -0800
committerIan Lance Taylor <iant@golang.org>2021-02-12 11:38:19 -0800
commit89d7be42db00cd0953e7d4584877cf50a56ed046 (patch)
tree3a471e8ee60b7be687ab7501f70379618adcf174 /gcc/config
parent305e9d2c7815e90a29bbde1e3a7cd776861f4d7c (diff)
parent9769564e7456453e2273071d0faa5aab2554ff78 (diff)
downloadgcc-89d7be42db00cd0953e7d4584877cf50a56ed046.zip
gcc-89d7be42db00cd0953e7d4584877cf50a56ed046.tar.gz
gcc-89d7be42db00cd0953e7d4584877cf50a56ed046.tar.bz2
Merge from trunk revision 9769564e7456453e2273071d0faa5aab2554ff78.
Diffstat (limited to 'gcc/config')
-rw-r--r--gcc/config/aarch64/aarch64-cost-tables.h18
-rw-r--r--gcc/config/aarch64/aarch64-simd-builtins.def13
-rw-r--r--gcc/config/aarch64/aarch64-simd.md113
-rw-r--r--gcc/config/aarch64/aarch64.c12
-rw-r--r--gcc/config/aarch64/arm_neon.h244
-rw-r--r--gcc/config/arm/aarch-common-protos.h1
-rw-r--r--gcc/config/arm/aarch-cost-tables.h18
-rw-r--r--gcc/config/arm/arm.c21
-rw-r--r--gcc/config/arm/thumb2.md28
-rw-r--r--gcc/config/i386/i386-expand.c43
-rw-r--r--gcc/config/i386/i386-options.c2
-rw-r--r--gcc/config/i386/sse.md11
-rw-r--r--gcc/config/i386/winnt.c4
-rw-r--r--gcc/config/i386/x86-tune.def10
-rw-r--r--gcc/config/nvptx/nvptx.c6
-rw-r--r--gcc/config/rs6000/predicates.md4
16 files changed, 343 insertions, 205 deletions
diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h
index c309f88..dd2e7e7 100644
--- a/gcc/config/aarch64/aarch64-cost-tables.h
+++ b/gcc/config/aarch64/aarch64-cost-tables.h
@@ -123,7 +123,8 @@ const struct cpu_cost_table qdf24xx_extra_costs =
},
/* Vector */
{
- COSTS_N_INSNS (1) /* alu. */
+ COSTS_N_INSNS (1), /* alu. */
+ COSTS_N_INSNS (4) /* mult. */
}
};
@@ -227,7 +228,8 @@ const struct cpu_cost_table thunderx_extra_costs =
},
/* Vector */
{
- COSTS_N_INSNS (1) /* Alu. */
+ COSTS_N_INSNS (1), /* Alu. */
+ COSTS_N_INSNS (4) /* mult. */
}
};
@@ -330,7 +332,8 @@ const struct cpu_cost_table thunderx2t99_extra_costs =
},
/* Vector */
{
- COSTS_N_INSNS (1) /* Alu. */
+ COSTS_N_INSNS (1), /* Alu. */
+ COSTS_N_INSNS (4) /* Mult. */
}
};
@@ -433,7 +436,8 @@ const struct cpu_cost_table thunderx3t110_extra_costs =
},
/* Vector */
{
- COSTS_N_INSNS (1) /* Alu. */
+ COSTS_N_INSNS (1), /* Alu. */
+ COSTS_N_INSNS (4) /* Mult. */
}
};
@@ -537,7 +541,8 @@ const struct cpu_cost_table tsv110_extra_costs =
},
/* Vector */
{
- COSTS_N_INSNS (1) /* alu. */
+ COSTS_N_INSNS (1), /* alu. */
+ COSTS_N_INSNS (4) /* mult. */
}
};
@@ -640,7 +645,8 @@ const struct cpu_cost_table a64fx_extra_costs =
},
/* Vector */
{
- COSTS_N_INSNS (1) /* alu. */
+ COSTS_N_INSNS (1), /* alu. */
+ COSTS_N_INSNS (4) /* mult. */
}
};
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index b787cb9..b885bd5 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -55,6 +55,11 @@
BUILTIN_VS (UNOP, ctz, 2, NONE)
BUILTIN_VB (UNOP, popcount, 2, NONE)
+ /* Implemented by aarch64_get_low<mode>. */
+ BUILTIN_VQMOV (UNOP, get_low, 0, AUTO_FP)
+ /* Implemented by aarch64_get_high<mode>. */
+ BUILTIN_VQMOV (UNOP, get_high, 0, AUTO_FP)
+
/* Implemented by aarch64_<sur>q<r>shl<mode>. */
BUILTIN_VSDQ_I (BINOP, sqshl, 0, NONE)
BUILTIN_VSDQ_I (BINOP_UUS, uqshl, 0, NONE)
@@ -300,6 +305,14 @@
BUILTIN_VD_HSI (BINOP, smull_n, 0, NONE)
BUILTIN_VD_HSI (BINOPU, umull_n, 0, NONE)
+ BUILTIN_VQ_HSI (BINOP, smull_hi_n, 0, NONE)
+ BUILTIN_VQ_HSI (BINOPU, umull_hi_n, 0, NONE)
+
+ BUILTIN_VQ_HSI (TERNOP_LANE, smull_hi_lane, 0, NONE)
+ BUILTIN_VQ_HSI (TERNOP_LANE, smull_hi_laneq, 0, NONE)
+ BUILTIN_VQ_HSI (TERNOPU_LANE, umull_hi_lane, 0, NONE)
+ BUILTIN_VQ_HSI (TERNOPU_LANE, umull_hi_laneq, 0, NONE)
+
BUILTIN_VD_HSI (TERNOP_LANE, vec_smult_lane_, 0, NONE)
BUILTIN_VD_HSI (QUADOP_LANE, vec_smlal_lane_, 0, NONE)
BUILTIN_VD_HSI (TERNOP_LANE, vec_smult_laneq_, 0, NONE)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 393bab1..71aa77d 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -297,6 +297,28 @@
"TARGET_SIMD"
)
+(define_expand "aarch64_get_low<mode>"
+ [(match_operand:<VHALF> 0 "register_operand")
+ (match_operand:VQMOV 1 "register_operand")]
+ "TARGET_SIMD"
+ {
+ rtx lo = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false);
+ emit_insn (gen_aarch64_get_half<mode> (operands[0], operands[1], lo));
+ DONE;
+ }
+)
+
+(define_expand "aarch64_get_high<mode>"
+ [(match_operand:<VHALF> 0 "register_operand")
+ (match_operand:VQMOV 1 "register_operand")]
+ "TARGET_SIMD"
+ {
+ rtx hi = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
+ emit_insn (gen_aarch64_get_half<mode> (operands[0], operands[1], hi));
+ DONE;
+ }
+)
+
(define_insn_and_split "aarch64_simd_mov_from_<mode>low"
[(set (match_operand:<VHALF> 0 "register_operand" "=w,?r")
(vec_select:<VHALF>
@@ -2253,6 +2275,70 @@
[(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
)
+(define_insn "aarch64_<su>mull_hi_lane<mode>_insn"
+ [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+ (mult:<VWIDE>
+ (ANY_EXTEND:<VWIDE> (vec_select:<VHALF>
+ (match_operand:VQ_HSI 1 "register_operand" "w")
+ (match_operand:VQ_HSI 2 "vect_par_cnst_hi_half" "")))
+ (ANY_EXTEND:<VWIDE> (vec_duplicate:<VHALF>
+ (vec_select:<VEL>
+ (match_operand:<VCOND> 3 "register_operand" "<vwx>")
+ (parallel [(match_operand:SI 4 "immediate_operand" "i")]))))))]
+ "TARGET_SIMD"
+ {
+ operands[4] = aarch64_endian_lane_rtx (<VCOND>mode, INTVAL (operands[4]));
+ return "<su>mull2\\t%0.<Vwtype>, %1.<Vtype>, %3.<Vetype>[%4]";
+ }
+ [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_expand "aarch64_<su>mull_hi_lane<mode>"
+ [(match_operand:<VWIDE> 0 "register_operand")
+ (ANY_EXTEND:<VWIDE>(match_operand:VQ_HSI 1 "register_operand"))
+ (match_operand:<VCOND> 2 "register_operand")
+ (match_operand:SI 3 "immediate_operand")]
+ "TARGET_SIMD"
+{
+ rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
+ emit_insn (gen_aarch64_<su>mull_hi_lane<mode>_insn (operands[0],
+ operands[1], p, operands[2], operands[3]));
+ DONE;
+}
+)
+
+(define_insn "aarch64_<su>mull_hi_laneq<mode>_insn"
+ [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+ (mult:<VWIDE>
+ (ANY_EXTEND:<VWIDE> (vec_select:<VHALF>
+ (match_operand:VQ_HSI 1 "register_operand" "w")
+ (match_operand:VQ_HSI 2 "vect_par_cnst_hi_half" "")))
+ (ANY_EXTEND:<VWIDE> (vec_duplicate:<VHALF>
+ (vec_select:<VEL>
+ (match_operand:<VCONQ> 3 "register_operand" "<vwx>")
+ (parallel [(match_operand:SI 4 "immediate_operand" "i")]))))))]
+ "TARGET_SIMD"
+ {
+ operands[4] = aarch64_endian_lane_rtx (<VCONQ>mode, INTVAL (operands[4]));
+ return "<su>mull2\\t%0.<Vwtype>, %1.<Vtype>, %3.<Vetype>[%4]";
+ }
+ [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_expand "aarch64_<su>mull_hi_laneq<mode>"
+ [(match_operand:<VWIDE> 0 "register_operand")
+ (ANY_EXTEND:<VWIDE>(match_operand:VQ_HSI 1 "register_operand"))
+ (match_operand:<VCONQ> 2 "register_operand")
+ (match_operand:SI 3 "immediate_operand")]
+ "TARGET_SIMD"
+{
+ rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
+ emit_insn (gen_aarch64_<su>mull_hi_laneq<mode>_insn (operands[0],
+ operands[1], p, operands[2], operands[3]));
+ DONE;
+}
+)
+
(define_insn "aarch64_<su>mull_n<mode>"
[(set (match_operand:<VWIDE> 0 "register_operand" "=w")
(mult:<VWIDE>
@@ -2266,6 +2352,33 @@
[(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
)
+(define_insn "aarch64_<su>mull_hi_n<mode>_insn"
+ [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+ (mult:<VWIDE>
+ (ANY_EXTEND:<VWIDE> (vec_select:<VHALF>
+ (match_operand:VQ_HSI 1 "register_operand" "w")
+ (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" "")))
+ (ANY_EXTEND:<VWIDE>
+ (vec_duplicate:<VCOND>
+ (match_operand:<VEL> 2 "register_operand" "<h_con>")))))]
+ "TARGET_SIMD"
+ "<su>mull2\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[0]"
+ [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
+)
+
+(define_expand "aarch64_<su>mull_hi_n<mode>"
+ [(match_operand:<VWIDE> 0 "register_operand")
+ (ANY_EXTEND:<VWIDE> (match_operand:VQ_HSI 1 "register_operand"))
+ (match_operand:<VEL> 2 "register_operand")]
+ "TARGET_SIMD"
+ {
+ rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
+ emit_insn (gen_aarch64_<su>mull_hi_n<mode>_insn (operands[0], operands[1],
+ operands[2], p));
+ DONE;
+ }
+)
+
;; vmlal_lane_s16 intrinsics
(define_insn "aarch64_vec_<su>mlal_lane<Qlane>"
[(set (match_operand:<VWIDE> 0 "register_operand" "=w")
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index b6192e5..146ed8c 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -11568,7 +11568,6 @@ aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
if (VECTOR_MODE_P (mode))
{
unsigned int vec_flags = aarch64_classify_vector_mode (mode);
- mode = GET_MODE_INNER (mode);
if (vec_flags & VEC_ADVSIMD)
{
/* The by-element versions of the instruction have the same costs as
@@ -11582,6 +11581,17 @@ aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
else if (GET_CODE (op1) == VEC_DUPLICATE)
op1 = XEXP (op1, 0);
}
+ cost += rtx_cost (op0, mode, MULT, 0, speed);
+ cost += rtx_cost (op1, mode, MULT, 1, speed);
+ if (speed)
+ {
+ if (GET_CODE (x) == MULT)
+ cost += extra_cost->vect.mult;
+ /* This is to catch the SSRA costing currently flowing here. */
+ else
+ cost += extra_cost->vect.alu;
+ }
+ return cost;
}
/* Integer multiply/fma. */
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index d50bd65..baa30bd 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -6302,216 +6302,203 @@ vsetq_lane_u64 (uint64_t __elem, uint64x2_t __vec, const int __index)
return __aarch64_vset_lane_any (__elem, __vec, __index);
}
-#define __GET_LOW(__TYPE) \
- uint64x2_t tmp = vreinterpretq_u64_##__TYPE (__a); \
- uint64x1_t lo = vcreate_u64 (vgetq_lane_u64 (tmp, 0)); \
- return vreinterpret_##__TYPE##_u64 (lo);
-
__extension__ extern __inline float16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vget_low_f16 (float16x8_t __a)
{
- __GET_LOW (f16);
+ return __builtin_aarch64_get_lowv8hf (__a);
}
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vget_low_f32 (float32x4_t __a)
{
- __GET_LOW (f32);
+ return __builtin_aarch64_get_lowv4sf (__a);
}
__extension__ extern __inline float64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vget_low_f64 (float64x2_t __a)
{
- return (float64x1_t) {vgetq_lane_f64 (__a, 0)};
+ return (float64x1_t) {__builtin_aarch64_get_lowv2df (__a)};
}
__extension__ extern __inline poly8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vget_low_p8 (poly8x16_t __a)
{
- __GET_LOW (p8);
+ return (poly8x8_t) __builtin_aarch64_get_lowv16qi ((int8x16_t) __a);
}
__extension__ extern __inline poly16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vget_low_p16 (poly16x8_t __a)
{
- __GET_LOW (p16);
+ return (poly16x4_t) __builtin_aarch64_get_lowv8hi ((int16x8_t) __a);
}
__extension__ extern __inline poly64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vget_low_p64 (poly64x2_t __a)
{
- __GET_LOW (p64);
+ return (poly64x1_t) __builtin_aarch64_get_lowv2di ((int64x2_t) __a);
}
__extension__ extern __inline int8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vget_low_s8 (int8x16_t __a)
{
- __GET_LOW (s8);
+ return __builtin_aarch64_get_lowv16qi (__a);
}
__extension__ extern __inline int16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vget_low_s16 (int16x8_t __a)
{
- __GET_LOW (s16);
+ return __builtin_aarch64_get_lowv8hi (__a);
}
__extension__ extern __inline int32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vget_low_s32 (int32x4_t __a)
{
- __GET_LOW (s32);
+ return __builtin_aarch64_get_lowv4si (__a);
}
__extension__ extern __inline int64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vget_low_s64 (int64x2_t __a)
{
- __GET_LOW (s64);
+ return (int64x1_t) {__builtin_aarch64_get_lowv2di (__a)};
}
__extension__ extern __inline uint8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vget_low_u8 (uint8x16_t __a)
{
- __GET_LOW (u8);
+ return (uint8x8_t) __builtin_aarch64_get_lowv16qi ((int8x16_t) __a);
}
__extension__ extern __inline uint16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vget_low_u16 (uint16x8_t __a)
{
- __GET_LOW (u16);
+ return (uint16x4_t) __builtin_aarch64_get_lowv8hi ((int16x8_t) __a);
}
__extension__ extern __inline uint32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vget_low_u32 (uint32x4_t __a)
{
- __GET_LOW (u32);
+ return (uint32x2_t) __builtin_aarch64_get_lowv4si ((int32x4_t) __a);
}
__extension__ extern __inline uint64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vget_low_u64 (uint64x2_t __a)
{
- return vcreate_u64 (vgetq_lane_u64 (__a, 0));
+ return (uint64x1_t) {__builtin_aarch64_get_lowv2di ((int64x2_t) __a)};
}
-#undef __GET_LOW
-
-#define __GET_HIGH(__TYPE) \
- uint64x2_t tmp = vreinterpretq_u64_##__TYPE (__a); \
- uint64x1_t hi = vcreate_u64 (vgetq_lane_u64 (tmp, 1)); \
- return vreinterpret_##__TYPE##_u64 (hi);
-
__extension__ extern __inline float16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vget_high_f16 (float16x8_t __a)
{
- __GET_HIGH (f16);
+ return __builtin_aarch64_get_highv8hf (__a);
}
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vget_high_f32 (float32x4_t __a)
{
- __GET_HIGH (f32);
+ return __builtin_aarch64_get_highv4sf (__a);
}
__extension__ extern __inline float64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vget_high_f64 (float64x2_t __a)
{
- __GET_HIGH (f64);
+ return (float64x1_t) {__builtin_aarch64_get_highv2df (__a)};
}
__extension__ extern __inline poly8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vget_high_p8 (poly8x16_t __a)
{
- __GET_HIGH (p8);
+ return (poly8x8_t) __builtin_aarch64_get_highv16qi ((int8x16_t) __a);
}
__extension__ extern __inline poly16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vget_high_p16 (poly16x8_t __a)
{
- __GET_HIGH (p16);
+ return (poly16x4_t) __builtin_aarch64_get_highv8hi ((int16x8_t) __a);
}
__extension__ extern __inline poly64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vget_high_p64 (poly64x2_t __a)
{
- __GET_HIGH (p64);
+ return (poly64x1_t) __builtin_aarch64_get_highv2di ((int64x2_t) __a);
}
__extension__ extern __inline int8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vget_high_s8 (int8x16_t __a)
{
- __GET_HIGH (s8);
+ return __builtin_aarch64_get_highv16qi (__a);
}
__extension__ extern __inline int16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vget_high_s16 (int16x8_t __a)
{
- __GET_HIGH (s16);
+ return __builtin_aarch64_get_highv8hi (__a);
}
__extension__ extern __inline int32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vget_high_s32 (int32x4_t __a)
{
- __GET_HIGH (s32);
+ return __builtin_aarch64_get_highv4si (__a);
}
__extension__ extern __inline int64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vget_high_s64 (int64x2_t __a)
{
- __GET_HIGH (s64);
+ return (int64x1_t) {__builtin_aarch64_get_highv2di (__a)};
}
__extension__ extern __inline uint8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vget_high_u8 (uint8x16_t __a)
{
- __GET_HIGH (u8);
+ return (uint8x8_t) __builtin_aarch64_get_highv16qi ((int8x16_t) __a);
}
__extension__ extern __inline uint16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vget_high_u16 (uint16x8_t __a)
{
- __GET_HIGH (u16);
+ return (uint16x4_t) __builtin_aarch64_get_highv8hi ((int16x8_t) __a);
}
__extension__ extern __inline uint32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vget_high_u32 (uint32x4_t __a)
{
- __GET_HIGH (u32);
+ return (uint32x2_t) __builtin_aarch64_get_highv4si ((int32x4_t) __a);
}
-#undef __GET_HIGH
-
__extension__ extern __inline uint64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vget_high_u64 (uint64x2_t __a)
{
- return vcreate_u64 (vgetq_lane_u64 (__a, 1));
+ return (uint64x1_t) {__builtin_aarch64_get_highv2di ((int64x2_t) __a)};
}
+
__extension__ extern __inline int8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcombine_s8 (int8x8_t __a, int8x8_t __b)
@@ -8167,156 +8154,89 @@ vshrn_n_u64 (uint64x2_t __a, const int __b)
{
return (uint32x2_t)__builtin_aarch64_shrnv2di ((int64x2_t)__a, __b);
}
-#define vmull_high_lane_s16(a, b, c) \
- __extension__ \
- ({ \
- int16x4_t b_ = (b); \
- int16x8_t a_ = (a); \
- int32x4_t result; \
- __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]" \
- : "=w"(result) \
- : "w"(a_), "x"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
-#define vmull_high_lane_s32(a, b, c) \
- __extension__ \
- ({ \
- int32x2_t b_ = (b); \
- int32x4_t a_ = (a); \
- int64x2_t result; \
- __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]" \
- : "=w"(result) \
- : "w"(a_), "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmull_high_lane_s16 (int16x8_t __a, int16x4_t __v, const int __lane)
+{
+ return __builtin_aarch64_smull_hi_lanev8hi (__a, __v, __lane);
+}
-#define vmull_high_lane_u16(a, b, c) \
- __extension__ \
- ({ \
- uint16x4_t b_ = (b); \
- uint16x8_t a_ = (a); \
- uint32x4_t result; \
- __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]" \
- : "=w"(result) \
- : "w"(a_), "x"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmull_high_lane_s32 (int32x4_t __a, int32x2_t __v, const int __lane)
+{
+ return __builtin_aarch64_smull_hi_lanev4si (__a, __v, __lane);
+}
-#define vmull_high_lane_u32(a, b, c) \
- __extension__ \
- ({ \
- uint32x2_t b_ = (b); \
- uint32x4_t a_ = (a); \
- uint64x2_t result; \
- __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]" \
- : "=w"(result) \
- : "w"(a_), "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmull_high_lane_u16 (uint16x8_t __a, uint16x4_t __v, const int __lane)
+{
+ return __builtin_aarch64_umull_hi_lanev8hi_uuus (__a, __v, __lane);
+}
-#define vmull_high_laneq_s16(a, b, c) \
- __extension__ \
- ({ \
- int16x8_t b_ = (b); \
- int16x8_t a_ = (a); \
- int32x4_t result; \
- __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]" \
- : "=w"(result) \
- : "w"(a_), "x"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmull_high_lane_u32 (uint32x4_t __a, uint32x2_t __v, const int __lane)
+{
+ return __builtin_aarch64_umull_hi_lanev4si_uuus (__a, __v, __lane);
+}
-#define vmull_high_laneq_s32(a, b, c) \
- __extension__ \
- ({ \
- int32x4_t b_ = (b); \
- int32x4_t a_ = (a); \
- int64x2_t result; \
- __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]" \
- : "=w"(result) \
- : "w"(a_), "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmull_high_laneq_s16 (int16x8_t __a, int16x8_t __v, const int __lane)
+{
+ return __builtin_aarch64_smull_hi_laneqv8hi (__a, __v, __lane);
+}
-#define vmull_high_laneq_u16(a, b, c) \
- __extension__ \
- ({ \
- uint16x8_t b_ = (b); \
- uint16x8_t a_ = (a); \
- uint32x4_t result; \
- __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]" \
- : "=w"(result) \
- : "w"(a_), "x"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmull_high_laneq_s32 (int32x4_t __a, int32x4_t __v, const int __lane)
+{
+ return __builtin_aarch64_smull_hi_laneqv4si (__a, __v, __lane);
+}
-#define vmull_high_laneq_u32(a, b, c) \
- __extension__ \
- ({ \
- uint32x4_t b_ = (b); \
- uint32x4_t a_ = (a); \
- uint64x2_t result; \
- __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]" \
- : "=w"(result) \
- : "w"(a_), "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmull_high_laneq_u16 (uint16x8_t __a, uint16x8_t __v, const int __lane)
+{
+ return __builtin_aarch64_umull_hi_laneqv8hi_uuus (__a, __v, __lane);
+}
+
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmull_high_laneq_u32 (uint32x4_t __a, uint32x4_t __v, const int __lane)
+{
+ return __builtin_aarch64_umull_hi_laneqv4si_uuus (__a, __v, __lane);
+}
__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmull_high_n_s16 (int16x8_t __a, int16_t __b)
{
- int32x4_t __result;
- __asm__ ("smull2 %0.4s,%1.8h,%2.h[0]"
- : "=w"(__result)
- : "w"(__a), "x"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_smull_hi_nv8hi (__a, __b);
}
__extension__ extern __inline int64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmull_high_n_s32 (int32x4_t __a, int32_t __b)
{
- int64x2_t __result;
- __asm__ ("smull2 %0.2d,%1.4s,%2.s[0]"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_smull_hi_nv4si (__a, __b);
}
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmull_high_n_u16 (uint16x8_t __a, uint16_t __b)
{
- uint32x4_t __result;
- __asm__ ("umull2 %0.4s,%1.8h,%2.h[0]"
- : "=w"(__result)
- : "w"(__a), "x"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_umull_hi_nv8hi_uuu (__a, __b);
}
__extension__ extern __inline uint64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmull_high_n_u32 (uint32x4_t __a, uint32_t __b)
{
- uint64x2_t __result;
- __asm__ ("umull2 %0.2d,%1.4s,%2.s[0]"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_umull_hi_nv4si_uuu (__a, __b);
}
__extension__ extern __inline poly16x8_t
diff --git a/gcc/config/arm/aarch-common-protos.h b/gcc/config/arm/aarch-common-protos.h
index 251de3d..7a9cf3d 100644
--- a/gcc/config/arm/aarch-common-protos.h
+++ b/gcc/config/arm/aarch-common-protos.h
@@ -132,6 +132,7 @@ struct fp_cost_table
struct vector_cost_table
{
const int alu;
+ const int mult;
};
struct cpu_cost_table
diff --git a/gcc/config/arm/aarch-cost-tables.h b/gcc/config/arm/aarch-cost-tables.h
index d4baee4..25ff702 100644
--- a/gcc/config/arm/aarch-cost-tables.h
+++ b/gcc/config/arm/aarch-cost-tables.h
@@ -121,7 +121,8 @@ const struct cpu_cost_table generic_extra_costs =
},
/* Vector */
{
- COSTS_N_INSNS (1) /* alu. */
+ COSTS_N_INSNS (1), /* alu. */
+ COSTS_N_INSNS (4) /* mult. */
}
};
@@ -224,7 +225,8 @@ const struct cpu_cost_table cortexa53_extra_costs =
},
/* Vector */
{
- COSTS_N_INSNS (1) /* alu. */
+ COSTS_N_INSNS (1), /* alu. */
+ COSTS_N_INSNS (4) /* mult. */
}
};
@@ -327,7 +329,8 @@ const struct cpu_cost_table cortexa57_extra_costs =
},
/* Vector */
{
- COSTS_N_INSNS (1) /* alu. */
+ COSTS_N_INSNS (1), /* alu. */
+ COSTS_N_INSNS (4) /* mult. */
}
};
@@ -430,7 +433,8 @@ const struct cpu_cost_table cortexa76_extra_costs =
},
/* Vector */
{
- COSTS_N_INSNS (1) /* alu. */
+ COSTS_N_INSNS (1), /* alu. */
+ COSTS_N_INSNS (4) /* mult. */
}
};
@@ -533,7 +537,8 @@ const struct cpu_cost_table exynosm1_extra_costs =
},
/* Vector */
{
- COSTS_N_INSNS (0) /* alu. */
+ COSTS_N_INSNS (0), /* alu. */
+ COSTS_N_INSNS (4) /* mult. */
}
};
@@ -636,7 +641,8 @@ const struct cpu_cost_table xgene1_extra_costs =
},
/* Vector */
{
- COSTS_N_INSNS (2) /* alu. */
+ COSTS_N_INSNS (2), /* alu. */
+ COSTS_N_INSNS (8) /* mult. */
}
};
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index e22396d..d254f41 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -1192,7 +1192,8 @@ const struct cpu_cost_table cortexa9_extra_costs =
},
/* Vector */
{
- COSTS_N_INSNS (1) /* alu. */
+ COSTS_N_INSNS (1), /* alu. */
+ COSTS_N_INSNS (4) /* mult. */
}
};
@@ -1295,7 +1296,8 @@ const struct cpu_cost_table cortexa8_extra_costs =
},
/* Vector */
{
- COSTS_N_INSNS (1) /* alu. */
+ COSTS_N_INSNS (1), /* alu. */
+ COSTS_N_INSNS (4) /* mult. */
}
};
@@ -1399,7 +1401,8 @@ const struct cpu_cost_table cortexa5_extra_costs =
},
/* Vector */
{
- COSTS_N_INSNS (1) /* alu. */
+ COSTS_N_INSNS (1), /* alu. */
+ COSTS_N_INSNS (4) /* mult. */
}
};
@@ -1504,7 +1507,8 @@ const struct cpu_cost_table cortexa7_extra_costs =
},
/* Vector */
{
- COSTS_N_INSNS (1) /* alu. */
+ COSTS_N_INSNS (1), /* alu. */
+ COSTS_N_INSNS (4) /* mult. */
}
};
@@ -1607,7 +1611,8 @@ const struct cpu_cost_table cortexa12_extra_costs =
},
/* Vector */
{
- COSTS_N_INSNS (1) /* alu. */
+ COSTS_N_INSNS (1), /* alu. */
+ COSTS_N_INSNS (4) /* mult. */
}
};
@@ -1710,7 +1715,8 @@ const struct cpu_cost_table cortexa15_extra_costs =
},
/* Vector */
{
- COSTS_N_INSNS (1) /* alu. */
+ COSTS_N_INSNS (1), /* alu. */
+ COSTS_N_INSNS (4) /* mult. */
}
};
@@ -1813,7 +1819,8 @@ const struct cpu_cost_table v7m_extra_costs =
},
/* Vector */
{
- COSTS_N_INSNS (1) /* alu. */
+ COSTS_N_INSNS (1), /* alu. */
+ COSTS_N_INSNS (4) /* mult. */
}
};
diff --git a/gcc/config/arm/thumb2.md b/gcc/config/arm/thumb2.md
index bd53bf3..d7fd96c 100644
--- a/gcc/config/arm/thumb2.md
+++ b/gcc/config/arm/thumb2.md
@@ -1711,15 +1711,27 @@
;; Originally expanded by 'doloop_end'.
(define_insn "*doloop_end_internal"
- [(parallel [(set (pc)
- (if_then_else
- (ne (reg:SI LR_REGNUM) (const_int 1))
- (label_ref (match_operand 0 "" ""))
- (pc)))
- (set (reg:SI LR_REGNUM)
- (plus:SI (reg:SI LR_REGNUM) (const_int -1)))])]
+ [(set (pc)
+ (if_then_else
+ (ne (reg:SI LR_REGNUM) (const_int 1))
+ (label_ref (match_operand 0 "" ""))
+ (pc)))
+ (set (reg:SI LR_REGNUM)
+ (plus:SI (reg:SI LR_REGNUM) (const_int -1)))
+ (clobber (reg:CC CC_REGNUM))]
"TARGET_32BIT && TARGET_HAVE_LOB"
- "le\t%|lr, %l0")
+ {
+ if (get_attr_length (insn) == 4)
+ return "le\t%|lr, %l0";
+ else
+ return "subs\t%|lr, #1;bne\t%l0";
+ }
+ [(set (attr "length")
+ (if_then_else
+ (ltu (minus (pc) (match_dup 0)) (const_int 1024))
+ (const_int 4)
+ (const_int 6)))
+ (set_attr "type" "branch")])
(define_expand "doloop_begin"
[(match_operand 0 "" "")
diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index d64b4ac..02d3142 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -3469,6 +3469,33 @@ ix86_valid_mask_cmp_mode (machine_mode mode)
return vector_size == 64 || TARGET_AVX512VL;
}
+/* Return true if integer mask comparison should be used. */
+static bool
+ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode,
+ rtx op_true, rtx op_false)
+{
+ if (GET_MODE_SIZE (mode) == 64)
+ return true;
+
+ /* When op_true is NULL, op_false must be NULL, or vice versa. */
+ gcc_assert (!op_true == !op_false);
+
+ /* When op_true/op_false is NULL or cmp_mode is not valid mask cmp mode,
+ vector dest is required. */
+ if (!op_true || !ix86_valid_mask_cmp_mode (cmp_mode))
+ return false;
+
+ /* Exclude those that could be optimized in ix86_expand_sse_movcc. */
+ if (op_false == CONST0_RTX (mode)
+ || op_true == CONST0_RTX (mode)
+ || (INTEGRAL_MODE_P (mode)
+ && (op_true == CONSTM1_RTX (mode)
+ || op_false == CONSTM1_RTX (mode))))
+ return false;
+
+ return true;
+}
+
/* Expand an SSE comparison. Return the register with the result. */
static rtx
@@ -3485,7 +3512,7 @@ ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
bool maskcmp = false;
rtx x;
- if (ix86_valid_mask_cmp_mode (cmp_ops_mode))
+ if (ix86_use_mask_cmp_p (mode, cmp_ops_mode, op_true, op_false))
{
unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
maskcmp = true;
@@ -3517,7 +3544,7 @@ ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
- if (cmp_mode != mode && !maskcmp)
+ if (cmp_mode != mode)
{
x = force_reg (cmp_ops_mode, x);
convert_move (dest, x, false);
@@ -3544,9 +3571,6 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
return;
}
- /* In AVX512F the result of comparison is an integer mask. */
- bool maskcmp = mode != cmpmode && ix86_valid_mask_cmp_mode (mode);
-
rtx t2, t3, x;
/* If we have an integer mask and FP value then we need
@@ -3557,8 +3581,11 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
cmp = gen_rtx_SUBREG (mode, cmp, 0);
}
- if (maskcmp)
+ /* In AVX512F the result of comparison is an integer mask. */
+ if (mode != cmpmode
+ && GET_MODE_CLASS (cmpmode) == MODE_INT)
{
+ gcc_assert (ix86_valid_mask_cmp_mode (mode));
/* Using vector move with mask register. */
cmp = force_reg (cmpmode, cmp);
/* Optimize for mask zero. */
@@ -4016,7 +4043,7 @@ ix86_expand_fp_vec_cmp (rtx operands[])
}
else
cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
- operands[1], operands[2]);
+ NULL, NULL);
if (operands[0] != cmp)
emit_move_insn (operands[0], cmp);
@@ -4041,7 +4068,7 @@ ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
;
/* AVX512F supports all of the comparsions
on all 128/256/512-bit vector int types. */
- else if (ix86_valid_mask_cmp_mode (mode))
+ else if (ix86_use_mask_cmp_p (data_mode, mode, op_true, op_false))
;
else
{
diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c
index a70f6ed..cdeabbf 100644
--- a/gcc/config/i386/i386-options.c
+++ b/gcc/config/i386/i386-options.c
@@ -98,6 +98,8 @@ along with GCC; see the file COPYING3. If not see
#endif
/* Processor feature/optimization bitmasks. */
+#define m_NONE HOST_WIDE_INT_0U
+#define m_ALL (~HOST_WIDE_INT_0U)
#define m_386 (HOST_WIDE_INT_1U<<PROCESSOR_I386)
#define m_486 (HOST_WIDE_INT_1U<<PROCESSOR_I486)
#define m_PENT (HOST_WIDE_INT_1U<<PROCESSOR_PENTIUM)
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 369a00d..db5be59 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -6356,6 +6356,7 @@
(match_operand:V2SF 1 "register_operand")))]
"TARGET_AVX512DQ && TARGET_AVX512VL"
{
+ operands[1] = force_reg (V2SFmode, operands[1]);
operands[1] = simplify_gen_subreg (V4SFmode, operands[1], V2SFmode, 0);
emit_insn (gen_avx512dq_fix<fixunssuffix>_truncv2sfv2di2
(operands[0], operands[1]));
@@ -18013,6 +18014,7 @@
{
if (!MEM_P (operands[1]))
{
+ operands[1] = force_reg (V8QImode, operands[1]);
operands[1] = simplify_gen_subreg (V16QImode, operands[1], V8QImode, 0);
emit_insn (gen_sse4_1_<code>v8qiv8hi2 (operands[0], operands[1]));
DONE;
@@ -18090,6 +18092,7 @@
{
if (!MEM_P (operands[1]))
{
+ operands[1] = force_reg (V8QImode, operands[1]);
operands[1] = simplify_gen_subreg (V16QImode, operands[1], V8QImode, 0);
emit_insn (gen_avx2_<code>v8qiv8si2 (operands[0], operands[1]));
DONE;
@@ -18153,6 +18156,7 @@
{
if (!MEM_P (operands[1]))
{
+ operands[1] = force_reg (V4QImode, operands[1]);
operands[1] = simplify_gen_subreg (V16QImode, operands[1], V4QImode, 0);
emit_insn (gen_sse4_1_<code>v4qiv4si2 (operands[0], operands[1]));
DONE;
@@ -18279,6 +18283,7 @@
{
if (!MEM_P (operands[1]))
{
+ operands[1] = force_reg (V4HImode, operands[1]);
operands[1] = simplify_gen_subreg (V8HImode, operands[1], V4HImode, 0);
emit_insn (gen_sse4_1_<code>v4hiv4si2 (operands[0], operands[1]));
DONE;
@@ -18366,6 +18371,7 @@
{
if (!MEM_P (operands[1]))
{
+ operands[1] = force_reg (V8QImode, operands[1]);
operands[1] = simplify_gen_subreg (V16QImode, operands[1], V8QImode, 0);
emit_insn (gen_avx512f_<code>v8qiv8di2 (operands[0], operands[1]));
DONE;
@@ -18427,6 +18433,7 @@
{
if (!MEM_P (operands[1]))
{
+ operands[1] = force_reg (V8QImode, operands[1]);
operands[1] = simplify_gen_subreg (V16QImode, operands[1], V8QImode, 0);
emit_insn (gen_avx2_<code>v4qiv4di2 (operands[0], operands[1]));
DONE;
@@ -18453,6 +18460,7 @@
(match_operand:V2QI 1 "register_operand")))]
"TARGET_SSE4_1"
{
+ operands[1] = force_reg (V2QImode, operands[1]);
operands[1] = simplify_gen_subreg (V16QImode, operands[1], V2QImode, 0);
emit_insn (gen_sse4_1_<code>v2qiv2di2 (operands[0], operands[1]));
DONE;
@@ -18525,6 +18533,7 @@
{
if (!MEM_P (operands[1]))
{
+ operands[1] = force_reg (V4HImode, operands[1]);
operands[1] = simplify_gen_subreg (V8HImode, operands[1], V4HImode, 0);
emit_insn (gen_avx2_<code>v4hiv4di2 (operands[0], operands[1]));
DONE;
@@ -18586,6 +18595,7 @@
{
if (!MEM_P (operands[1]))
{
+ operands[1] = force_reg (V2HImode, operands[1]);
operands[1] = simplify_gen_subreg (V8HImode, operands[1], V2HImode, 0);
emit_insn (gen_sse4_1_<code>v2hiv2di2 (operands[0], operands[1]));
DONE;
@@ -18737,6 +18747,7 @@
{
if (!MEM_P (operands[1]))
{
+ operands[1] = force_reg (V2SImode, operands[1]);
operands[1] = simplify_gen_subreg (V4SImode, operands[1], V2SImode, 0);
emit_insn (gen_sse4_1_<code>v2siv2di2 (operands[0], operands[1]));
DONE;
diff --git a/gcc/config/i386/winnt.c b/gcc/config/i386/winnt.c
index 962c88e..adc3f36 100644
--- a/gcc/config/i386/winnt.c
+++ b/gcc/config/i386/winnt.c
@@ -1231,6 +1231,10 @@ i386_pe_seh_unwind_emit (FILE *asm_out_file, rtx_insn *insn)
seh = cfun->machine->seh;
if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_SWITCH_TEXT_SECTIONS)
{
+ /* See ix86_seh_fixup_eh_fallthru for the rationale. */
+ rtx_insn *prev = prev_active_insn (insn);
+ if (prev && !insn_nothrow_p (prev))
+ fputs ("\tnop\n", asm_out_file);
fputs ("\t.seh_endproc\n", asm_out_file);
seh->in_cold_section = true;
return;
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 7ace8da..140ccb3 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -453,12 +453,12 @@ DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | m_ZNVE
/* X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL: if false, unaligned loads are
split. */
DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL, "256_unaligned_load_optimal",
- ~(m_NEHALEM | m_SANDYBRIDGE | m_GENERIC))
+ ~(m_NEHALEM | m_SANDYBRIDGE))
/* X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL: if false, unaligned stores are
split. */
DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_store_optimal",
- ~(m_NEHALEM | m_SANDYBRIDGE | m_BDVER | m_ZNVER1 | m_GENERIC))
+ ~(m_NEHALEM | m_SANDYBRIDGE | m_BDVER | m_ZNVER1))
/* X86_TUNE_AVX256_SPLIT_REGS: if true, AVX256 ops are split into two AVX128 ops. */
DEF_TUNE (X86_TUNE_AVX256_SPLIT_REGS, "avx256_split_regs",m_BDVER | m_BTVER2
@@ -580,15 +580,15 @@ DEF_TUNE (X86_TUNE_AVOID_VECTOR_DECODE, "avoid_vector_decode",
on simulation result. But after P4 was made, no performance benefit
was observed with branch hints. It also increases the code size.
As a result, icc never generates branch hints. */
-DEF_TUNE (X86_TUNE_BRANCH_PREDICTION_HINTS, "branch_prediction_hints", 0U)
+DEF_TUNE (X86_TUNE_BRANCH_PREDICTION_HINTS, "branch_prediction_hints", m_NONE)
/* X86_TUNE_QIMODE_MATH: Enable use of 8bit arithmetic. */
-DEF_TUNE (X86_TUNE_QIMODE_MATH, "qimode_math", ~0U)
+DEF_TUNE (X86_TUNE_QIMODE_MATH, "qimode_math", m_ALL)
/* X86_TUNE_PROMOTE_QI_REGS: This enables generic code that promotes all 8bit
arithmetic to 32bit via PROMOTE_MODE macro. This code generation scheme
is usually used for RISC targets. */
-DEF_TUNE (X86_TUNE_PROMOTE_QI_REGS, "promote_qi_regs", 0U)
+DEF_TUNE (X86_TUNE_PROMOTE_QI_REGS, "promote_qi_regs", m_NONE)
/* X86_TUNE_EMIT_VZEROUPPER: This enables vzeroupper instruction insertion
before a transfer of control flow out of the function. */
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index f08b679..794c5a6 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -74,6 +74,7 @@
#include "cfgloop.h"
#include "fold-const.h"
#include "intl.h"
+#include "opts.h"
/* This file should be included last. */
#include "target-def.h"
@@ -219,7 +220,10 @@ nvptx_option_override (void)
flag_no_common = 1;
/* The patch area requires nops, which we don't have. */
- if (function_entry_patch_area_size > 0)
+ HOST_WIDE_INT patch_area_size, patch_area_entry;
+ parse_and_check_patch_area (flag_patchable_function_entry, false,
+ &patch_area_size, &patch_area_entry);
+ if (patch_area_size > 0)
sorry ("not generating patch area, nops not supported");
/* Assumes that it will see only hard registers. */
diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md
index 76328ec..bd26c62 100644
--- a/gcc/config/rs6000/predicates.md
+++ b/gcc/config/rs6000/predicates.md
@@ -1156,7 +1156,9 @@
;; Return 1 if this operand is valid for a MMA assemble accumulator insn.
(define_special_predicate "mma_assemble_input_operand"
(match_test "(mode == V16QImode
- && (vsx_register_operand (op, mode) || MEM_P (op)))"))
+ && (vsx_register_operand (op, mode)
+ || (MEM_P (op)
+ && quad_address_p (XEXP (op, 0), mode, false))))"))
;; Return 1 if this operand is valid for an MMA disassemble insn.
(define_predicate "mma_disassemble_output_operand"