aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPhoebe Wang <phoebe.wang@intel.com>2024-03-04 18:09:41 +0800
committerTom Stellard <tstellar@redhat.com>2024-03-19 14:06:42 -0700
commit26a1d6601d727a96f4301d0d8647b5a42760ae0c (patch)
tree232815b14ea830e8ab92f111fdb4844e7e5a3f76
parent0bf7ff1028fb7cc81324e9c38c585e6533b754e4 (diff)
downloadllvm-26a1d6601d727a96f4301d0d8647b5a42760ae0c.zip
llvm-26a1d6601d727a96f4301d0d8647b5a42760ae0c.tar.gz
llvm-26a1d6601d727a96f4301d0d8647b5a42760ae0c.tar.bz2
[X86] Add missing subvector_subreg_lowering for BF16 (#83720)llvmorg-18.1.2
-rw-r--r--llvm/lib/Target/X86/X86InstrVecCompiler.td3
-rw-r--r--llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll22
-rw-r--r--llvm/test/CodeGen/X86/bfloat.ll1
3 files changed, 25 insertions, 1 deletions
diff --git a/llvm/lib/Target/X86/X86InstrVecCompiler.td b/llvm/lib/Target/X86/X86InstrVecCompiler.td
index bbd19cf..461b2ba 100644
--- a/llvm/lib/Target/X86/X86InstrVecCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrVecCompiler.td
@@ -83,6 +83,7 @@ defm : subvector_subreg_lowering<VR128, v2f64, VR256, v4f64, sub_xmm>;
defm : subvector_subreg_lowering<VR128, v8i16, VR256, v16i16, sub_xmm>;
defm : subvector_subreg_lowering<VR128, v16i8, VR256, v32i8, sub_xmm>;
defm : subvector_subreg_lowering<VR128, v8f16, VR256, v16f16, sub_xmm>;
+defm : subvector_subreg_lowering<VR128, v8bf16, VR256, v16bf16, sub_xmm>;
// A 128-bit subvector extract from the first 512-bit vector position is a
// subregister copy that needs no instruction. Likewise, a 128-bit subvector
@@ -95,6 +96,7 @@ defm : subvector_subreg_lowering<VR128, v2f64, VR512, v8f64, sub_xmm>;
defm : subvector_subreg_lowering<VR128, v8i16, VR512, v32i16, sub_xmm>;
defm : subvector_subreg_lowering<VR128, v16i8, VR512, v64i8, sub_xmm>;
defm : subvector_subreg_lowering<VR128, v8f16, VR512, v32f16, sub_xmm>;
+defm : subvector_subreg_lowering<VR128, v8bf16, VR512, v32bf16, sub_xmm>;
// A 128-bit subvector extract from the first 512-bit vector position is a
// subregister copy that needs no instruction. Likewise, a 128-bit subvector
@@ -107,6 +109,7 @@ defm : subvector_subreg_lowering<VR256, v4f64, VR512, v8f64, sub_ymm>;
defm : subvector_subreg_lowering<VR256, v16i16, VR512, v32i16, sub_ymm>;
defm : subvector_subreg_lowering<VR256, v32i8, VR512, v64i8, sub_ymm>;
defm : subvector_subreg_lowering<VR256, v16f16, VR512, v32f16, sub_ymm>;
+defm : subvector_subreg_lowering<VR256, v16bf16, VR512, v32bf16, sub_ymm>;
// If we're inserting into an all zeros vector, just use a plain move which
diff --git a/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll
index 0826faa..482713e12 100644
--- a/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll
@@ -381,3 +381,25 @@ entry:
%1 = shufflevector <8 x bfloat> %0, <8 x bfloat> undef, <16 x i32> zeroinitializer
ret <16 x bfloat> %1
}
+
+define <16 x i32> @pr83358() {
+; X86-LABEL: pr83358:
+; X86: # %bb.0:
+; X86-NEXT: vcvtneps2bf16y {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x72,0x05,A,A,A,A]
+; X86-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; X86-NEXT: vshufi64x2 $0, %zmm0, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x43,0xc0,0x00]
+; X86-NEXT: # zmm0 = zmm0[0,1,0,1,0,1,0,1]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: pr83358:
+; X64: # %bb.0:
+; X64-NEXT: vcvtneps2bf16y {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x72,0x05,A,A,A,A]
+; X64-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; X64-NEXT: vshufi64x2 $0, %zmm0, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x43,0xc0,0x00]
+; X64-NEXT: # zmm0 = zmm0[0,1,0,1,0,1,0,1]
+; X64-NEXT: retq # encoding: [0xc3]
+ %1 = call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00>)
+ %2 = bitcast <8 x bfloat> %1 to <4 x i32>
+ %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ ret <16 x i32> %3
+}
diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll
index f2d3c4f..cd1dba1 100644
--- a/llvm/test/CodeGen/X86/bfloat.ll
+++ b/llvm/test/CodeGen/X86/bfloat.ll
@@ -2423,7 +2423,6 @@ define <16 x bfloat> @fptrunc_v16f32(<16 x float> %a) nounwind {
; AVXNC-LABEL: fptrunc_v16f32:
; AVXNC: # %bb.0:
; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm0, %xmm0
-; AVXNC-NEXT: vinsertf128 $0, %xmm0, %ymm0, %ymm0
; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm1, %xmm1
; AVXNC-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVXNC-NEXT: retq