diff options
| author | Vang Thao <vang.thao@amd.com> | 2026-01-29 19:46:43 -0800 |
|---|---|---|
| committer | Vang Thao <vang.thao@amd.com> | 2026-01-29 19:46:43 -0800 |
| commit | 44e081162afda063dfe1690d1c51a8fccdf8aaae (patch) | |
| tree | 5cea9c75223f0497f415c5c9c884c9f190161217 | |
| parent | a00f5a6d633885c6a59db25e6d3d5663589f5079 (diff) | |
| download | llvm-users/vangthao95/globalisel-class-patch-2.zip llvm-users/vangthao95/globalisel-class-patch-2.tar.gz llvm-users/vangthao95/globalisel-class-patch-2.tar.bz2 | |
[AMDGPU][GlobalISel] Add RegBankLegalize rules for amdgcn.class (users/vangthao95/globalisel-class-patch-2)
4 files changed, 267 insertions, 107 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 1eaec8f..acbdc6d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -1159,6 +1159,14 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, addRulesForIOpcs({amdgcn_s_getpc}).Any({{UniS64, _}, {{Sgpr64}, {None}}}); + addRulesForIOpcs({amdgcn_class}) + .Any({{UniS1, _, S16}, {{UniInVcc}, {IntrId, Vgpr16, Vgpr32}}}) + .Any({{DivS1, _, S16}, {{Vcc}, {IntrId, Vgpr16, Vgpr32}}}) + .Any({{UniS1, _, S32}, {{UniInVcc}, {IntrId, Vgpr32, Vgpr32}}}) + .Any({{DivS1, _, S32}, {{Vcc}, {IntrId, Vgpr32, Vgpr32}}}) + .Any({{UniS1, _, S64}, {{UniInVcc}, {IntrId, Vgpr64, Vgpr32}}}) + .Any({{DivS1, _, S64}, {{Vcc}, {IntrId, Vgpr64, Vgpr32}}}); + // This is "intrinsic lane mask" it was set to i32/i64 in llvm-ir. addRulesForIOpcs({amdgcn_end_cf}) .Any({{_, UniS32}, {{}, {IntrId, Sgpr32}}}) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.class.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.class.mir index 95e63c7..66db698 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.class.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.class.mir @@ -1,6 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass='amdgpu-regbankselect,amdgpu-regbanklegalize' %s -o - | FileCheck %s --- name: class_ss diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll index 6f67015..a0f1d41 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; RUN: llc -global-isel=0 -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI-SDAG %s -; RUN: llc -global-isel=1 -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI-GISEL %s declare half @llvm.fabs.f16(half %a) declare i1 @llvm.amdgcn.class.f16(half %a, i32 %b) @@ -39,11 +39,18 @@ define amdgpu_kernel void @class_f16( ; VI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] ; VI-GISEL-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; VI-GISEL-NEXT: s_load_dword s2, s[8:9], 0x0 -; VI-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_class_f16_e64 s[2:3], v0, s2 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[2:3] +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; VI-GISEL-NEXT: v_cmp_class_f16_e32 vcc, s2, v1 +; VI-GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0 +; VI-GISEL-NEXT: s_and_b32 s2, s2, 1 +; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; VI-GISEL-NEXT: s_cselect_b32 s2, -1, 0 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] -; VI-GISEL-NEXT: s_nop 2 ; VI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-GISEL-NEXT: s_endpgm ptr addrspace(1) %r, @@ -75,14 +82,19 @@ define amdgpu_kernel void @class_f16_fabs( ; ; VI-GISEL-LABEL: class_f16_fabs: ; VI-GISEL: ; 
%bb.0: ; %entry -; VI-GISEL-NEXT: s_load_dword s3, s[8:9], 0x28 -; VI-GISEL-NEXT: s_load_dword s4, s[8:9], 0x4c +; VI-GISEL-NEXT: s_load_dword s3, s[8:9], 0x4c +; VI-GISEL-NEXT: s_load_dword s4, s[8:9], 0x28 ; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-GISEL-NEXT: s_mov_b32 s2, -1 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 -; VI-GISEL-NEXT: v_cmp_class_f16_e64 s[4:5], |v0|, s4 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; VI-GISEL-NEXT: v_cmp_class_f16_e64 s[4:5], |s4|, v0 +; VI-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; VI-GISEL-NEXT: s_cselect_b32 s3, 1, 0 +; VI-GISEL-NEXT: s_and_b32 s3, s3, 1 +; VI-GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; VI-GISEL-NEXT: s_cselect_b32 s3, -1, 0 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 ; VI-GISEL-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-GISEL-NEXT: s_endpgm @@ -123,7 +135,12 @@ define amdgpu_kernel void @class_f16_fneg( ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_max_f16_e64 v0, -s3, -s3 ; VI-GISEL-NEXT: v_cmp_class_f16_e64 s[4:5], v0, s4 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; VI-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; VI-GISEL-NEXT: s_cselect_b32 s3, 1, 0 +; VI-GISEL-NEXT: s_and_b32 s3, s3, 1 +; VI-GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; VI-GISEL-NEXT: s_cselect_b32 s3, -1, 0 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 ; VI-GISEL-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-GISEL-NEXT: s_endpgm @@ -164,7 +181,12 @@ define amdgpu_kernel void @class_f16_fabs_fneg( ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_max_f16_e64 v0, -|s3|, -|s3| ; VI-GISEL-NEXT: v_cmp_class_f16_e64 s[4:5], v0, s4 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; VI-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; VI-GISEL-NEXT: s_cselect_b32 s3, 1, 0 +; VI-GISEL-NEXT: s_and_b32 s3, s3, 1 +; VI-GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; VI-GISEL-NEXT: s_cselect_b32 s3, -1, 0 +; VI-GISEL-NEXT: v_mov_b32_e32 
v0, s3 ; VI-GISEL-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-GISEL-NEXT: s_endpgm @@ -202,7 +224,12 @@ define amdgpu_kernel void @class_f16_1( ; VI-GISEL-NEXT: s_mov_b32 s2, -1 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_class_f16_e64 s[4:5], s3, 1 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; VI-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; VI-GISEL-NEXT: s_cselect_b32 s3, 1, 0 +; VI-GISEL-NEXT: s_and_b32 s3, s3, 1 +; VI-GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; VI-GISEL-NEXT: s_cselect_b32 s3, -1, 0 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 ; VI-GISEL-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-GISEL-NEXT: s_endpgm @@ -235,7 +262,12 @@ define amdgpu_kernel void @class_f16_64( ; VI-GISEL-NEXT: s_mov_b32 s2, -1 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_class_f16_e64 s[4:5], s3, 64 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; VI-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; VI-GISEL-NEXT: s_cselect_b32 s3, 1, 0 +; VI-GISEL-NEXT: s_and_b32 s3, s3, 1 +; VI-GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; VI-GISEL-NEXT: s_cselect_b32 s3, -1, 0 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 ; VI-GISEL-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-GISEL-NEXT: s_endpgm @@ -270,7 +302,12 @@ define amdgpu_kernel void @class_f16_full_mask( ; VI-GISEL-NEXT: s_mov_b32 s2, -1 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_class_f16_e32 vcc, s3, v0 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; VI-GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; VI-GISEL-NEXT: s_cselect_b32 s3, 1, 0 +; VI-GISEL-NEXT: s_and_b32 s3, s3, 1 +; VI-GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; VI-GISEL-NEXT: s_cselect_b32 s3, -1, 0 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 ; VI-GISEL-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-GISEL-NEXT: s_endpgm @@ -305,7 +342,12 @@ define amdgpu_kernel void 
@class_f16_nine_bit_mask( ; VI-GISEL-NEXT: s_mov_b32 s2, -1 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_class_f16_e32 vcc, s3, v0 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; VI-GISEL-NEXT: s_cmp_lg_u64 vcc, 0 +; VI-GISEL-NEXT: s_cselect_b32 s3, 1, 0 +; VI-GISEL-NEXT: s_and_b32 s3, s3, 1 +; VI-GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; VI-GISEL-NEXT: s_cselect_b32 s3, -1, 0 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 ; VI-GISEL-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll index 0a5522a..865b2a5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -global-isel=0 -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-SDAG %s -; RUN: llc -global-isel=1 -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-GISEL %s declare i1 @llvm.amdgcn.class.f32(float, i32) #1 declare i1 @llvm.amdgcn.class.f64(double, i32) #1 @@ -25,15 +25,21 @@ define amdgpu_kernel void @test_class_f32(ptr addrspace(1) %out, [8 x i32], floa ; ; SI-GISEL-LABEL: test_class_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s3, s[4:5], 0x1c -; SI-GISEL-NEXT: s_load_dword s6, s[4:5], 0x13 +; SI-GISEL-NEXT: s_load_dword s0, s[4:5], 0x1c +; SI-GISEL-NEXT: s_load_dword s1, s[4:5], 0x13 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; SI-GISEL-NEXT: v_cmp_class_f32_e32 vcc, s1, v0 +; SI-GISEL-NEXT: s_or_b64 s[0:1], vcc, vcc +; 
SI-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-GISEL-NEXT: s_and_b32 s3, s2, 1 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 -; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_mov_b32_e32 v0, s3 -; SI-GISEL-NEXT: v_cmp_class_f32_e32 vcc, s6, v0 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; SI-GISEL-NEXT: s_cselect_b32 s4, -1, 0 ; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 %b) #1 @@ -59,15 +65,21 @@ define amdgpu_kernel void @test_class_fabs_f32(ptr addrspace(1) %out, [8 x i32], ; ; SI-GISEL-LABEL: test_class_fabs_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s3, s[4:5], 0x13 -; SI-GISEL-NEXT: s_load_dword s6, s[4:5], 0x1c +; SI-GISEL-NEXT: s_load_dword s0, s[4:5], 0x1c +; SI-GISEL-NEXT: s_load_dword s1, s[4:5], 0x13 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; SI-GISEL-NEXT: v_cmp_class_f32_e64 s[0:1], |s1|, v0 +; SI-GISEL-NEXT: s_or_b64 s[0:1], s[0:1], s[0:1] +; SI-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-GISEL-NEXT: s_and_b32 s3, s2, 1 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 -; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_mov_b32_e32 v0, s3 -; SI-GISEL-NEXT: v_cmp_class_f32_e64 s[4:5], |v0|, s6 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; SI-GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; SI-GISEL-NEXT: s_cselect_b32 s4, -1, 0 ; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm %a.fabs = call float @llvm.fabs.f32(float %a) #1 @@ -94,15 +106,21 @@ define amdgpu_kernel void @test_class_fneg_f32(ptr addrspace(1) %out, [8 x i32], ; ; 
SI-GISEL-LABEL: test_class_fneg_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s3, s[4:5], 0x13 -; SI-GISEL-NEXT: s_load_dword s6, s[4:5], 0x1c +; SI-GISEL-NEXT: s_load_dword s0, s[4:5], 0x13 +; SI-GISEL-NEXT: s_load_dword s1, s[4:5], 0x1c +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: v_mul_f32_e64 v0, -1.0, s0 +; SI-GISEL-NEXT: v_cmp_class_f32_e64 s[0:1], v0, s1 +; SI-GISEL-NEXT: s_or_b64 s[0:1], s[0:1], s[0:1] +; SI-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-GISEL-NEXT: s_and_b32 s3, s2, 1 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 -; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_mul_f32_e64 v0, 1.0, -s3 -; SI-GISEL-NEXT: v_cmp_class_f32_e64 s[4:5], v0, s6 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; SI-GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; SI-GISEL-NEXT: s_cselect_b32 s4, -1, 0 ; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm %a.fneg = fsub float -0.0, %a @@ -129,15 +147,21 @@ define amdgpu_kernel void @test_class_fneg_fabs_f32(ptr addrspace(1) %out, [8 x ; ; SI-GISEL-LABEL: test_class_fneg_fabs_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s3, s[4:5], 0x13 -; SI-GISEL-NEXT: s_load_dword s6, s[4:5], 0x1c +; SI-GISEL-NEXT: s_load_dword s0, s[4:5], 0x13 +; SI-GISEL-NEXT: s_load_dword s1, s[4:5], 0x1c +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: v_mul_f32_e64 v0, -1.0, |s0| +; SI-GISEL-NEXT: v_cmp_class_f32_e64 s[0:1], v0, s1 +; SI-GISEL-NEXT: s_or_b64 s[0:1], s[0:1], s[0:1] +; SI-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-GISEL-NEXT: s_and_b32 s3, s2, 1 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 -; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_mul_f32_e64 v0, 1.0, -|s3| -; SI-GISEL-NEXT: v_cmp_class_f32_e64 s[4:5], v0, s6 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, 
s[4:5] +; SI-GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; SI-GISEL-NEXT: s_cselect_b32 s4, -1, 0 ; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm %a.fabs = call float @llvm.fabs.f32(float %a) #1 @@ -163,13 +187,19 @@ define amdgpu_kernel void @test_class_1_f32(ptr addrspace(1) %out, float %a) #0 ; ; SI-GISEL-LABEL: test_class_1_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s3, s[4:5], 0xb +; SI-GISEL-NEXT: s_load_dword s0, s[4:5], 0xb +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: v_cmp_class_f32_e64 s[0:1], s0, 1 +; SI-GISEL-NEXT: s_or_b64 s[0:1], s[0:1], s[0:1] +; SI-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-GISEL-NEXT: s_and_b32 s3, s2, 1 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 -; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_cmp_class_f32_e64 s[4:5], s3, 1 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; SI-GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; SI-GISEL-NEXT: s_cselect_b32 s4, -1, 0 ; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 1) #1 @@ -193,13 +223,19 @@ define amdgpu_kernel void @test_class_64_f32(ptr addrspace(1) %out, float %a) #0 ; ; SI-GISEL-LABEL: test_class_64_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s3, s[4:5], 0xb +; SI-GISEL-NEXT: s_load_dword s0, s[4:5], 0xb +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: v_cmp_class_f32_e64 s[0:1], s0, 64 +; SI-GISEL-NEXT: s_or_b64 s[0:1], s[0:1], s[0:1] +; SI-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-GISEL-NEXT: s_and_b32 s3, s2, 1 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 -; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: 
v_cmp_class_f32_e64 s[4:5], s3, 64 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; SI-GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; SI-GISEL-NEXT: s_cselect_b32 s4, -1, 0 ; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 64) #1 @@ -225,14 +261,20 @@ define amdgpu_kernel void @test_class_full_mask_f32(ptr addrspace(1) %out, float ; ; SI-GISEL-LABEL: test_class_full_mask_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s3, s[4:5], 0xb -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-GISEL-NEXT: s_load_dword s0, s[4:5], 0xb ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x3ff -; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_cmp_class_f32_e32 vcc, s3, v0 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-GISEL-NEXT: v_cmp_class_f32_e32 vcc, s0, v0 +; SI-GISEL-NEXT: s_or_b64 s[0:1], vcc, vcc +; SI-GISEL-NEXT: s_cselect_b32 s2, 1, 0 +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-GISEL-NEXT: s_and_b32 s3, s2, 1 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; SI-GISEL-NEXT: s_cselect_b32 s4, -1, 0 ; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 1023) #1 @@ -257,14 +299,20 @@ define amdgpu_kernel void @test_class_9bit_mask_f32(ptr addrspace(1) %out, float ; ; SI-GISEL-LABEL: test_class_9bit_mask_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s3, s[4:5], 0xb -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-GISEL-NEXT: s_load_dword s0, s[4:5], 0xb ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x1ff -; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; 
SI-GISEL-NEXT: v_cmp_class_f32_e32 vcc, s3, v0 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-GISEL-NEXT: v_cmp_class_f32_e32 vcc, s0, v0 +; SI-GISEL-NEXT: s_or_b64 s[0:1], vcc, vcc +; SI-GISEL-NEXT: s_cselect_b32 s2, 1, 0 +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-GISEL-NEXT: s_and_b32 s3, s2, 1 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; SI-GISEL-NEXT: s_cselect_b32 s4, -1, 0 ; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 511) #1 @@ -430,15 +478,21 @@ define amdgpu_kernel void @test_class_f64(ptr addrspace(1) %out, [8 x i32], doub ; ; SI-GISEL-LABEL: test_class_f64: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s3, s[4:5], 0x1d -; SI-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; SI-GISEL-NEXT: s_load_dword s2, s[4:5], 0x1d +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x13 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, s[0:1], v0 +; SI-GISEL-NEXT: s_or_b64 s[0:1], vcc, vcc +; SI-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-GISEL-NEXT: s_and_b32 s3, s2, 1 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 -; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_mov_b32_e32 v0, s3 -; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, s[6:7], v0 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; SI-GISEL-NEXT: s_cselect_b32 s4, -1, 0 ; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 %b) #1 @@ -464,16 +518,21 @@ define amdgpu_kernel void @test_class_fabs_f64(ptr 
addrspace(1) %out, [8 x i32], ; ; SI-GISEL-LABEL: test_class_fabs_f64: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 -; SI-GISEL-NEXT: s_load_dword s3, s[4:5], 0x1d +; SI-GISEL-NEXT: s_load_dword s2, s[4:5], 0x1d +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x13 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; SI-GISEL-NEXT: v_cmp_class_f64_e64 s[0:1], |s[0:1]|, v0 +; SI-GISEL-NEXT: s_or_b64 s[0:1], s[0:1], s[0:1] +; SI-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-GISEL-NEXT: s_and_b32 s3, s2, 1 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 -; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; SI-GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], |v[0:1]|, s3 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; SI-GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; SI-GISEL-NEXT: s_cselect_b32 s4, -1, 0 ; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm %a.fabs = call double @llvm.fabs.f64(double %a) #1 @@ -500,15 +559,21 @@ define amdgpu_kernel void @test_class_fneg_f64(ptr addrspace(1) %out, [8 x i32], ; ; SI-GISEL-LABEL: test_class_fneg_f64: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 -; SI-GISEL-NEXT: s_load_dword s3, s[4:5], 0x1d +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x13 +; SI-GISEL-NEXT: s_load_dword s2, s[4:5], 0x1d +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: v_max_f64 v[0:1], -s[0:1], -s[0:1] +; SI-GISEL-NEXT: v_cmp_class_f64_e64 s[0:1], v[0:1], s2 +; SI-GISEL-NEXT: s_or_b64 s[0:1], s[0:1], s[0:1] +; SI-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-GISEL-NEXT: s_and_b32 s3, s2, 1 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 -; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: 
v_max_f64 v[0:1], -s[6:7], -s[6:7] -; SI-GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[0:1], s3 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; SI-GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; SI-GISEL-NEXT: s_cselect_b32 s4, -1, 0 ; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm %a.fneg = fsub double -0.0, %a @@ -535,15 +600,21 @@ define amdgpu_kernel void @test_class_fneg_fabs_f64(ptr addrspace(1) %out, [8 x ; ; SI-GISEL-LABEL: test_class_fneg_fabs_f64: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 -; SI-GISEL-NEXT: s_load_dword s3, s[4:5], 0x1d +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x13 +; SI-GISEL-NEXT: s_load_dword s2, s[4:5], 0x1d +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: v_max_f64 v[0:1], -|s[0:1]|, -|s[0:1]| +; SI-GISEL-NEXT: v_cmp_class_f64_e64 s[0:1], v[0:1], s2 +; SI-GISEL-NEXT: s_or_b64 s[0:1], s[0:1], s[0:1] +; SI-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-GISEL-NEXT: s_and_b32 s3, s2, 1 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 -; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_max_f64 v[0:1], -|s[6:7]|, -|s[6:7]| -; SI-GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[0:1], s3 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; SI-GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; SI-GISEL-NEXT: s_cselect_b32 s4, -1, 0 ; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm %a.fabs = call double @llvm.fabs.f64(double %a) #1 @@ -573,9 +644,14 @@ define amdgpu_kernel void @test_class_1_f64(ptr addrspace(1) %out, double %a) #0 ; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: v_cmp_class_f64_e64 s[2:3], s[2:3], 1 -; SI-GISEL-NEXT: 
v_cndmask_b32_e64 v0, 0, -1, s[2:3] +; SI-GISEL-NEXT: s_or_b64 s[2:3], s[2:3], s[2:3] +; SI-GISEL-NEXT: s_cselect_b32 s2, 1, 0 +; SI-GISEL-NEXT: s_and_b32 s3, s2, 1 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; SI-GISEL-NEXT: s_cselect_b32 s4, -1, 0 ; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4 ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 1) #1 @@ -603,9 +679,14 @@ define amdgpu_kernel void @test_class_64_f64(ptr addrspace(1) %out, double %a) # ; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: v_cmp_class_f64_e64 s[2:3], s[2:3], 64 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[2:3] +; SI-GISEL-NEXT: s_or_b64 s[2:3], s[2:3], s[2:3] +; SI-GISEL-NEXT: s_cselect_b32 s2, 1, 0 +; SI-GISEL-NEXT: s_and_b32 s3, s2, 1 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; SI-GISEL-NEXT: s_cselect_b32 s4, -1, 0 ; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4 ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 64) #1 @@ -631,14 +712,20 @@ define amdgpu_kernel void @test_class_full_mask_f64(ptr addrspace(1) %out, [8 x ; ; SI-GISEL-LABEL: test_class_full_mask_f64: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x13 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x1ff -; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, s[6:7], v0 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, s[0:1], v0 +; SI-GISEL-NEXT: s_or_b64 s[0:1], vcc, vcc +; SI-GISEL-NEXT: s_cselect_b32 s2, 1, 0 +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 
0x9 +; SI-GISEL-NEXT: s_and_b32 s3, s2, 1 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; SI-GISEL-NEXT: s_cselect_b32 s4, -1, 0 ; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 511) #1 @@ -671,16 +758,21 @@ define amdgpu_kernel void @v_test_class_full_mask_f64(ptr addrspace(1) %out, ptr ; SI-GISEL-LABEL: v_test_class_full_mask_f64: ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1ff ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, s[2:3], v1 +; SI-GISEL-NEXT: s_or_b64 s[2:3], vcc, vcc +; SI-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x1ff +; SI-GISEL-NEXT: s_and_b32 s3, s2, 1 ; SI-GISEL-NEXT: s_mov_b32 s2, 0 -; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, s[4:5], v2 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; SI-GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; SI-GISEL-NEXT: s_cselect_b32 s4, -1, 0 ; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, s4 ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -1132,6 +1224,8 @@ define amdgpu_kernel void @test_no_fold_or_class_f32_0(ptr addrspace(1) %out, pt ; SI-GISEL-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; SI-GISEL-NEXT: v_cmp_class_f32_e64 s[2:3], s8, 8 ; SI-GISEL-NEXT: s_mov_b32 s6, -1 +; SI-GISEL-NEXT: s_or_b64 s[2:3], s[2:3], s[2:3] +; SI-GISEL-NEXT: s_cselect_b64 s[2:3], exec, 0 ; 
SI-GISEL-NEXT: s_waitcnt vmcnt(0) ; SI-GISEL-NEXT: v_cmp_class_f32_e64 s[4:5], v0, 4 ; SI-GISEL-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] @@ -1166,13 +1260,19 @@ define amdgpu_kernel void @test_class_0_f32(ptr addrspace(1) %out, float %a) #0 ; ; SI-GISEL-LABEL: test_class_0_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s3, s[4:5], 0xb +; SI-GISEL-NEXT: s_load_dword s0, s[4:5], 0xb +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: v_cmp_class_f32_e64 s[0:1], s0, 0 +; SI-GISEL-NEXT: s_or_b64 s[0:1], s[0:1], s[0:1] +; SI-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-GISEL-NEXT: s_and_b32 s3, s2, 1 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 -; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_cmp_class_f32_e64 s[4:5], s3, 0 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; SI-GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; SI-GISEL-NEXT: s_cselect_b32 s4, -1, 0 ; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 0) #1 @@ -1197,9 +1297,14 @@ define amdgpu_kernel void @test_class_0_f64(ptr addrspace(1) %out, double %a) #0 ; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: v_cmp_class_f64_e64 s[2:3], s[2:3], 0 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[2:3] +; SI-GISEL-NEXT: s_or_b64 s[2:3], s[2:3], s[2:3] +; SI-GISEL-NEXT: s_cselect_b32 s2, 1, 0 +; SI-GISEL-NEXT: s_and_b32 s3, s2, 1 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; SI-GISEL-NEXT: s_cselect_b32 s4, -1, 0 ; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4 ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 0) #1 @@ -1221,14 +1326,20 @@ define amdgpu_kernel void 
@test_class_undef_f32(ptr addrspace(1) %out, float %a, ; ; SI-GISEL-LABEL: test_class_undef_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s3, s[4:5], 0xc -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_load_dword s0, s[4:5], 0xc ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_mov_b32_e32 v0, s3 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; SI-GISEL-NEXT: v_cmp_class_f32_e32 vcc, s0, v0 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-GISEL-NEXT: s_or_b64 s[0:1], vcc, vcc +; SI-GISEL-NEXT: s_cselect_b32 s2, 1, 0 +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-GISEL-NEXT: s_and_b32 s3, s2, 1 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; SI-GISEL-NEXT: s_cselect_b32 s4, -1, 0 ; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm %result = call i1 @llvm.amdgcn.class.f32(float poison, i32 %b) #1 |
