-rw-r--r--   gcc/config/aarch64/aarch64-simd.md                 | 57
-rw-r--r--   gcc/config/aarch64/aarch64.cc                      | 38
-rw-r--r--   gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c  | 61
3 files changed, 156 insertions, 0 deletions
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 5386043..104088f 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4867,6 +4867,63 @@
 }
 )
 
+;; Div optimizations using narrowings
+;; We can do the division of e.g. shorts by 255 faster by calculating it as
+;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in
+;; double the precision of x.
+;;
+;; If we imagine a short as being composed of two blocks of bytes then
+;; adding 257 or 0b0000_0001_0000_0001 to the number is equivalent to
+;; adding 1 to each sub component:
+;;
+;;      short value of 16-bits
+;; ┌──────────────┬────────────────┐
+;; │              │                │
+;; └──────────────┴────────────────┘
+;;   8-bit part1 ▲  8-bit part2   ▲
+;;               │                │
+;;               │                │
+;;              +1               +1
+;;
+;; After the first addition, we have to shift right by 8, and narrow the
+;; results back to a byte.  Remember that the addition must be done in
+;; double the precision of the input.  Since 8 is half the size of a short
+;; we can use a narrowing halving instruction in AArch64, addhn, which also
+;; does the addition in a wider precision and narrows back to a byte.  The
+;; shift itself is implicit in the operation as it writes back only the top
+;; half of the result, i.e. bits 2*esize-1:esize.
+;;
+;; Since we have narrowed the result of the first part back to a byte, for
+;; the second addition we can use a widening addition, uaddw.
+;;
+;; For the final shift, since it's unsigned arithmetic we emit an ushr by 8.
+;;
+;; The shift is later optimized by combine to a uzp2 with movi #0.
+(define_expand "@aarch64_bitmask_udiv<mode>3"
+  [(match_operand:VQN 0 "register_operand")
+   (match_operand:VQN 1 "register_operand")
+   (match_operand:VQN 2 "immediate_operand")]
+  "TARGET_SIMD"
+{
+  unsigned HOST_WIDE_INT size
+    = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode)) - 1;
+  rtx elt = unwrap_const_vec_duplicate (operands[2]);
+  if (!CONST_INT_P (elt) || UINTVAL (elt) != size)
+    FAIL;
+
+  rtx addend = gen_reg_rtx (<MODE>mode);
+  rtx val = aarch64_simd_gen_const_vector_dup (<VNARROWQ2>mode, 1);
+  emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROWQ2>mode));
+  rtx tmp1 = gen_reg_rtx (<VNARROWQ>mode);
+  rtx tmp2 = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_aarch64_addhn<mode> (tmp1, operands[1], addend));
+  unsigned bitsize = GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode);
+  rtx shift_vector = aarch64_simd_gen_const_vector_dup (<MODE>mode, bitsize);
+  emit_insn (gen_aarch64_uaddw<Vnarrowq> (tmp2, operands[1], tmp1));
+  emit_insn (gen_aarch64_simd_lshr<mode> (operands[0], tmp2, shift_vector));
+  DONE;
+})
+
 ;; pmul.
 
 (define_insn "aarch64_pmul<mode>"
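Editor's note, not part of the patch: the rewrite in the comment above can be brute-force checked for every 16-bit input. The sketch below assumes nothing beyond the identity itself; the helper names div255_ref and div255_trick are made up for illustration and do not exist in GCC. All arithmetic is done in uint32_t, i.e. double the precision of x, exactly as the comment requires.

  /* Illustrative only: brute-force check that
     (x + ((x + 257) >> 8)) >> 8 == x / 255 for all 16-bit x,
     with the intermediate arithmetic carried out in 32 bits.  */
  #include <assert.h>
  #include <stdint.h>
  #include <stdio.h>

  static uint16_t div255_ref (uint16_t x)    /* hypothetical helper name */
  {
    return x / 255;
  }

  static uint16_t div255_trick (uint16_t x)  /* hypothetical helper name */
  {
    /* Adding 257 (0x0101) adds 1 to each byte half, as in the diagram.  */
    uint32_t t = ((uint32_t) x + 257) >> 8;
    return (uint16_t) (((uint32_t) x + t) >> 8);
  }

  int main (void)
  {
    for (uint32_t x = 0; x <= UINT16_MAX; x++)
      assert (div255_trick ((uint16_t) x) == div255_ref ((uint16_t) x));
    puts ("ok: (x + ((x + 257) >> 8)) >> 8 == x / 255 for all 16-bit x");
    return 0;
  }
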
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index a7f7c3c..c91df6f 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -24306,6 +24306,40 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
   return ret;
 }
 
+/* Implement TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST.  */
+
+bool
+aarch64_vectorize_can_special_div_by_constant (enum tree_code code,
+					       tree vectype, wide_int cst,
+					       rtx *output, rtx in0, rtx in1)
+{
+  if (code != TRUNC_DIV_EXPR
+      || !TYPE_UNSIGNED (vectype))
+    return false;
+
+  unsigned int flags = aarch64_classify_vector_mode (TYPE_MODE (vectype));
+  if ((flags & VEC_ANY_SVE) && !TARGET_SVE2)
+    return false;
+
+  if (in0 == NULL_RTX && in1 == NULL_RTX)
+    {
+      wide_int val = wi::add (cst, 1);
+      int pow = wi::exact_log2 (val);
+      return pow == (int)(element_precision (vectype) / 2);
+    }
+
+  if (!VECTOR_TYPE_P (vectype))
+    return false;
+
+  gcc_assert (output);
+
+  if (!*output)
+    *output = gen_reg_rtx (TYPE_MODE (vectype));
+
+  emit_insn (gen_aarch64_bitmask_udiv3 (TYPE_MODE (vectype), *output, in0, in1));
+  return true;
+}
+
 /* Generate a byte permute mask for a register of mode MODE,
    which has NUNITS units.  */
 
@@ -27796,6 +27830,10 @@ aarch64_libgcc_floating_mode_supported_p
 #undef TARGET_VECTOR_ALIGNMENT
 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
 
+#undef TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
+#define TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST \
+  aarch64_vectorize_can_special_div_by_constant
+
 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
   aarch64_vectorize_preferred_vector_alignment
diff --git a/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c b/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c
new file mode 100644
index 0000000..2a53579
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c
@@ -0,0 +1,61 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <stdint.h>
+
+#pragma GCC target "+nosve"
+
+/*
+** draw_bitmap1:
+** ...
+**	addhn	v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h
+**	addhn	v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h
+**	uaddw	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b
+**	uaddw	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b
+**	uzp2	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+** ...
+*/
+void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xfe;
+}
+
+/*
+** draw_bitmap3:
+** ...
+**	addhn	v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s
+**	addhn	v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s
+**	uaddw	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
+**	uaddw	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
+**	uzp2	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
+** ...
+*/
+void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+/*
+** draw_bitmap4:
+** ...
+**	addhn	v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d
+**	addhn	v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d
+**	uaddw	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s
+**	uaddw	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s
+**	uzp2	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+** ...
+*/
+void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
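Editor's note, not part of the patch: the sequence the first testcase expects (addhn, uaddw, then the shift that combine turns into a uzp2 with a zero vector) can be modelled in scalar C for the case draw_bitmap1 exercises, where the dividend is a widened product of two uint8_t values and the divisor is 0xff. This is a sketch of the expansion under that assumption, not code from GCC; the helper name div255_insn_model is made up for illustration.

  /* Illustrative only: scalar model of the addhn/uaddw/ushr expansion for
     (pixel * level) / 0xff with the product widened to 16 bits, checked
     against plain division over every pixel/level pair.  */
  #include <assert.h>
  #include <stdint.h>
  #include <stdio.h>

  static uint8_t div255_insn_model (uint16_t x)  /* hypothetical helper name */
  {
    /* addhn: add 0x0101 (1 to each byte lane) and keep bits 15:8, narrowed.  */
    uint8_t hi = (uint8_t) (((uint32_t) x + 0x0101) >> 8);
    /* uaddw: widen the byte back to 16 bits and add it to the input.  */
    uint16_t sum = (uint16_t) (x + hi);
    /* ushr #8 (later a uzp2 with movi #0): keep the top byte.  */
    return (uint8_t) (sum >> 8);
  }

  int main (void)
  {
    for (uint32_t pixel = 0; pixel <= UINT8_MAX; pixel++)
      for (uint32_t level = 0; level <= UINT8_MAX; level++)
        {
          uint16_t p = (uint16_t) (pixel * level);
          assert (div255_insn_model (p) == p / 0xff);
        }
    puts ("ok: addhn/uaddw/ushr model matches (pixel * level) / 0xff");
    return 0;
  }

The narrowing to a byte after the addhn step is lossless here because the widened product is at most 255 * 255, so the quotient always fits in the narrow element; the draw_bitmap3 and draw_bitmap4 tests are the same pattern at the 16-bit and 32-bit element sizes.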