diff options
author | Dennis Zhang <dennis.zhang@arm.com> | 2020-03-03 13:13:31 +0000 |
---|---|---|
committer | Dennis Zhang <dennis.zhang@arm.com> | 2020-03-03 13:13:31 +0000 |
commit | 8e6d0dba166324f4b257329bd4b4ddc2b4522359 (patch) | |
tree | 128f011c82f3d7c24afbeb977c3caddc475d5cfc /gcc/config/arm/neon.md | |
parent | 9b4f00dd3f799337d8b8ef5e79f5a682c8059ab9 (diff) | |
download | gcc-8e6d0dba166324f4b257329bd4b4ddc2b4522359.zip gcc-8e6d0dba166324f4b257329bd4b4ddc2b4522359.tar.gz gcc-8e6d0dba166324f4b257329bd4b4ddc2b4522359.tar.bz2 |
arm: ACLE BFloat16 convert intrinsics
This patch is part of a series adding support for Armv8.6-A features.
It implements intrinsics to convert between bfloat16 and float32
formats.
gcc/ChangeLog:
* config/arm/arm_bf16.h (vcvtah_f32_bf16, vcvth_bf16_f32): New.
* config/arm/arm_neon.h (vcvt_f32_bf16, vcvtq_low_f32_bf16): New.
(vcvtq_high_f32_bf16, vcvt_bf16_f32): New.
(vcvtq_low_bf16_f32, vcvtq_high_bf16_f32): New.
* config/arm/arm_neon_builtins.def (vbfcvt, vbfcvt_high): New entries.
(vbfcvtv4sf, vbfcvtv4sf_high): Likewise.
* config/arm/iterators.md (VBFCVT, VBFCVTM): New mode iterators.
(V_bf_low, V_bf_cvt_m): New mode attributes.
* config/arm/neon.md (neon_vbfcvtv4sf<VBFCVT:mode>): New.
(neon_vbfcvtv4sf_highv8bf, neon_vbfcvtsf): New.
(neon_vbfcvt<VBFCVT:mode>, neon_vbfcvt_highv8bf): New.
(neon_vbfcvtbf_cvtmode<mode>, neon_vbfcvtbf): New.
* config/arm/unspecs.md (UNSPEC_BFCVT, UNSPEC_BFCVT_HIGH): New.
gcc/testsuite/ChangeLog:
* gcc.target/arm/simd/bf16_cvt_1.c: New test.
Diffstat (limited to 'gcc/config/arm/neon.md')
-rw-r--r-- | gcc/config/arm/neon.md | 77 |
1 file changed, 77 insertions, 0 deletions
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index fae8213..f5286d9 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -6660,3 +6660,80 @@ if (BYTES_BIG_ENDIAN) } [(set_attr "type" "neon_dot<q>")] ) + +(define_insn "neon_vbfcvtv4sf<VBFCVT:mode>" + [(set (match_operand:VBFCVT 0 "register_operand" "=w") + (unspec:VBFCVT [(match_operand:V4SF 1 "register_operand" "w")] + UNSPEC_BFCVT))] + "TARGET_BF16_SIMD" + "vcvt.bf16.f32\\t%<V_bf_low>0, %q1" + [(set_attr "type" "neon_fp_cvt_narrow_s_q")] +) + +(define_insn "neon_vbfcvtv4sf_highv8bf" + [(set (match_operand:V8BF 0 "register_operand" "=w") + (unspec:V8BF [(match_operand:V8BF 1 "register_operand" "0") + (match_operand:V4SF 2 "register_operand" "w")] + UNSPEC_BFCVT_HIGH))] + "TARGET_BF16_SIMD" + "vcvt.bf16.f32\\t%f0, %q2" + [(set_attr "type" "neon_fp_cvt_narrow_s_q")] +) + +(define_insn "neon_vbfcvtsf" + [(set (match_operand:BF 0 "register_operand" "=t") + (unspec:BF [(match_operand:SF 1 "register_operand" "t")] + UNSPEC_BFCVT))] + "TARGET_BF16_FP" + "vcvtb.bf16.f32\\t%0, %1" + [(set_attr "type" "f_cvt")] +) + +(define_insn "neon_vbfcvt<VBFCVT:mode>" + [(set (match_operand:V4SF 0 "register_operand" "=w") + (unspec:V4SF [(match_operand:VBFCVT 1 "register_operand" "w")] + UNSPEC_BFCVT))] + "TARGET_BF16_SIMD" + "vshll.u32\\t%q0, %<V_bf_low>1, #16" + [(set_attr "type" "neon_shift_imm_q")] +) + +(define_insn "neon_vbfcvt_highv8bf" + [(set (match_operand:V4SF 0 "register_operand" "=w") + (unspec:V4SF [(match_operand:V8BF 1 "register_operand" "w")] + UNSPEC_BFCVT_HIGH))] + "TARGET_BF16_SIMD" + "vshll.u32\\t%q0, %f1, #16" + [(set_attr "type" "neon_shift_imm_q")] +) + +;; Convert a BF scalar operand to SF via VSHL. +;; VSHL doesn't accept 32-bit registers where the BF and SF scalar operands +;; would be allocated, therefore the operands must be converted to intermediate +;; vectors (i.e. V2SI) in order to apply 64-bit registers. 
+(define_expand "neon_vbfcvtbf" + [(match_operand:SF 0 "register_operand") + (unspec:SF [(match_operand:BF 1 "register_operand")] UNSPEC_BFCVT)] + "TARGET_BF16_FP" +{ + rtx op0 = gen_reg_rtx (V2SImode); + rtx op1 = gen_reg_rtx (V2SImode); + emit_insn (gen_neon_vbfcvtbf_cvtmodev2si (op1, operands[1])); + emit_insn (gen_neon_vshl_nv2si (op0, op1, gen_int_mode(16, SImode))); + emit_insn (gen_neon_vbfcvtbf_cvtmodesf (operands[0], op0)); + DONE; +}) + +;; Convert BF mode to V2SI and V2SI to SF. +;; Implement this by allocating a 32-bit operand in the low half of a 64-bit +;; register indexed by a 32-bit sub-register number. +;; This will generate reloads but compiler can optimize out the moves. +;; Use 'x' constraint to guarantee the 32-bit sub-registers in an indexable +;; range so that to avoid extra moves. +(define_insn "neon_vbfcvtbf_cvtmode<mode>" + [(set (match_operand:VBFCVTM 0 "register_operand" "=x") + (unspec:VBFCVTM [(match_operand:<V_bf_cvt_m> 1 "register_operand" "0")] + UNSPEC_BFCVT))] + "TARGET_BF16_FP" + "" +) |