diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir | 47 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll | 232 |
2 files changed, 165 insertions, 114 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir index 7748b48..fed277d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir @@ -81,12 +81,14 @@ body: | ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) - ; CHECK-NEXT: [[AMDGPU_FFBH_U32:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR]](s32) - ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[AMDGPU_FFBH_U32]], [[C1]] - ; CHECK-NEXT: $vgpr0 = COPY [[AND]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]] + ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[AND]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[CTLZ_ZERO_UNDEF]], [[C1]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SUB]](s32) + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]] + ; CHECK-NEXT: $vgpr0 = COPY [[AND1]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s16) = G_TRUNC %0 %2:_(s16) = G_CTLZ_ZERO_UNDEF %1 @@ -147,15 +149,18 @@ body: | ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; CHECK-NEXT: [[AMDGPU_FFBH_U32:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR1]](s32) - ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[LSHR]], [[C]](s32) - ; CHECK-NEXT: [[AMDGPU_FFBH_U321:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR2]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[AMDGPU_FFBH_U32]], [[C1]] - ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[AMDGPU_FFBH_U321]], [[C1]] - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[BITCAST]], [[C1]] + ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[AND]](s32) + ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[CTLZ_ZERO_UNDEF]], [[C]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SUB]](s32) + ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF1:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[LSHR]](s32) + ; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[CTLZ_ZERO_UNDEF1]], [[C]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SUB1]](s32) + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[SHL]] ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; CHECK-NEXT: $vgpr0 = COPY [[BITCAST1]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 @@ -174,12 +179,14 @@ body: | ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 25 - ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) - ; CHECK-NEXT: [[FFBH:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR]](s32) - ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 127 - ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FFBH]], [[C1]] - ; CHECK-NEXT: $vgpr0 = COPY [[AND]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 127 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]] + ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[AND]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 25 + ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[CTLZ_ZERO_UNDEF]], [[C1]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SUB]](s32) + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]] + ; CHECK-NEXT: $vgpr0 = COPY [[AND1]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s7) = G_TRUNC %0 %2:_(s7) = G_CTLZ_ZERO_UNDEF %1 diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll index d94a27e..54adde3 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -322,8 +322,9 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s2, s2, 24 -; SI-NEXT: s_flbit_i32_b32 s4, s2 +; SI-NEXT: s_and_b32 s2, s2, 0xff +; SI-NEXT: s_flbit_i32_b32 s2, s2 +; SI-NEXT: s_sub_i32 s4, s2, 24 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -334,8 +335,9 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s2, s2, 24 +; VI-NEXT: s_and_b32 s2, s2, 0xff ; VI-NEXT: s_flbit_i32_b32 s2, s2 +; VI-NEXT: s_sub_i32 s2, s2, 24 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -355,13 +357,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; EG-NEXT: ALU clause starting at 8: ; EG-NEXT: MOV * T0.X, 0.0, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: LSHL * T0.W, T0.X, literal.x, -; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) -; EG-NEXT: FFBH_UINT T0.W, PV.W, +; EG-NEXT: FFBH_UINT T0.W, T0.X, ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x, +; EG-NEXT: -24(nan), 0(0.000000e+00) ; EG-NEXT: AND_INT T0.W, PV.W, literal.x, -; EG-NEXT: LSHL * T1.W, PS, literal.y, +; EG-NEXT: LSHL * T1.W, T1.W, literal.y, ; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45) ; EG-NEXT: LSHL T0.X, PV.W, PS, ; EG-NEXT: LSHL * T0.W, literal.x, PS, @@ -377,8 +379,9 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_lshr_b32 s0, s4, 24 +; GFX9-GISEL-NEXT: s_and_b32 s0, s4, 0xff ; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s0 +; GFX9-GISEL-NEXT: s_sub_i32 s0, s0, 24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_endpgm @@ -396,8 +399,9 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s2, s2, 16 -; SI-NEXT: s_flbit_i32_b32 s4, s2 +; SI-NEXT: s_and_b32 s2, s2, 0xffff +; SI-NEXT: s_flbit_i32_b32 s2, s2 +; SI-NEXT: s_add_i32 s4, s2, -16 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -430,13 +434,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; EG-NEXT: ALU clause starting at 8: ; EG-NEXT: MOV * T0.X, 0.0, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: LSHL * T0.W, T0.X, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: FFBH_UINT T0.W, PV.W, +; EG-NEXT: FFBH_UINT T0.W, T0.X, ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x, +; EG-NEXT: -16(nan), 0(0.000000e+00) ; EG-NEXT: AND_INT T0.W, PV.W, literal.x, -; EG-NEXT: LSHL * T1.W, PS, literal.y, +; EG-NEXT: LSHL * T1.W, T1.W, literal.y, ; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) ; EG-NEXT: LSHL T0.X, PV.W, PS, ; EG-NEXT: LSHL * T0.W, literal.x, PS, @@ -452,8 +456,9 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_lshr_b32 s0, s4, 16 +; GFX9-GISEL-NEXT: s_and_b32 s0, s4, 0xffff ; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s0 +; GFX9-GISEL-NEXT: s_sub_i32 s0, s0, 16 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-GISEL-NEXT: global_store_short v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_endpgm @@ -593,8 +598,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v0 -; SI-NEXT: v_ffbh_u32_e32 v1, v1 +; SI-NEXT: v_ffbh_u32_e32 v1, v0 +; SI-NEXT: v_subrev_i32_e32 v1, vcc, 24, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 @@ -608,8 +613,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v0 -; VI-NEXT: v_ffbh_u32_e32 v1, v1 +; VI-NEXT: v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; VI-NEXT: v_subrev_u32_e32 v1, vcc, 24, v1 ; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -621,7 +626,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X ; EG-NEXT: CF_END ; EG-NEXT: PAD @@ -630,11 +635,10 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; EG-NEXT: ALU clause starting at 8: ; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: LSHL * T0.W, T0.X, literal.x, -; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) -; EG-NEXT: FFBH_UINT T0.W, PV.W, -; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, -; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: FFBH_UINT * T0.W, T0.X, +; EG-NEXT: ADD_INT T0.W, PV.W, literal.x, +; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: -24(nan), 3(4.203895e-45) ; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W, ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T0.W, PV.W, literal.x, @@ -655,7 +659,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v2, v1 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v1 +; GFX9-GISEL-NEXT: v_subrev_u32_e32 v2, 24, v2 ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc @@ -688,8 +693,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_ffbh_u32_e32 v1, v1 +; SI-NEXT: v_ffbh_u32_e32 v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, -16, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 @@ -724,7 +729,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X ; EG-NEXT: CF_END ; EG-NEXT: PAD @@ -733,11 +738,10 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; EG-NEXT: ALU clause starting at 8: ; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: LSHL * T0.W, T0.X, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: FFBH_UINT T0.W, PV.W, -; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, -; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: FFBH_UINT * T0.W, T0.X, +; EG-NEXT: ADD_INT T0.W, PV.W, literal.x, +; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: -16(nan), 3(4.203895e-45) ; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W, ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T0.W, PV.W, literal.x, @@ -760,7 +764,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 -; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v2, v1 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v1 +; GFX9-GISEL-NEXT: v_subrev_u32_e32 v2, 16, v2 ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc @@ -1105,8 +1110,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: v_ffbh_u32_e32 v0, v0 +; SI-NEXT: v_subrev_i32_e32 v0, vcc, 24, v0 ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -1119,8 +1124,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; VI-NEXT: v_ffbh_u32_e32 v2, v0 +; VI-NEXT: v_ffbh_u32_e32 v0, v0 +; VI-NEXT: v_subrev_u32_e32 v2, vcc, 24, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_byte v[0:1], v2 @@ -1139,13 +1144,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p ; EG-NEXT: ALU clause starting at 8: ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, T0.X, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: LSHL * T0.W, T0.X, literal.x, -; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) -; EG-NEXT: FFBH_UINT T0.W, PV.W, +; EG-NEXT: FFBH_UINT T0.W, T0.X, ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x, +; EG-NEXT: -24(nan), 0(0.000000e+00) ; EG-NEXT: AND_INT T0.W, PV.W, literal.x, -; EG-NEXT: LSHL * T1.W, PS, literal.y, +; EG-NEXT: LSHL * T1.W, T1.W, literal.y, ; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45) ; EG-NEXT: LSHL T0.X, PV.W, PS, ; EG-NEXT: LSHL * T0.W, literal.x, PS, @@ -1167,7 +1172,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p ; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v0, v0 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 +; GFX9-GISEL-NEXT: v_subrev_u32_e32 v0, 24, v0 ; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1703,11 +1709,12 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa ; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v1, v0 ; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v3, vcc ; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v2, v0 -; GFX9-GISEL-NEXT: v_cmp_eq_u32_sdwa s[2:3], v0, v1 -; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, -1, s[2:3] +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 +; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 24, v1 +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2186,8 +2193,9 @@ define i7 @v_ctlz_zero_undef_i7(i7 %val) { ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i7: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v0, 25, v0 +; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 +; GFX9-GISEL-NEXT: v_subrev_u32_e32 v0, 25, v0 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] %ctlz = call i7 @llvm.ctlz.i7(i7 %val, i1 true) ret i7 %ctlz @@ -2278,8 +2286,9 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out, ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_lshr_b32 s0, s4, 14 +; GFX9-GISEL-NEXT: s_and_b32 s0, s4, 0x3ffff ; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s0 +; GFX9-GISEL-NEXT: s_sub_i32 s0, s0, 14 ; GFX9-GISEL-NEXT: s_and_b32 s0, s0, 0x3ffff ; GFX9-GISEL-NEXT: s_lshr_b32 s1, s0, 16 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s0 @@ -2317,8 +2326,9 @@ define i18 @v_ctlz_zero_undef_i18(i18 %val) { ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i18: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v0, 14, v0 +; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0x3ffff, v0 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 +; GFX9-GISEL-NEXT: v_subrev_u32_e32 v0, 14, v0 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] %ctlz = call i18 @llvm.ctlz.i18(i18 %val, i1 true) ret i18 %ctlz @@ -2355,10 +2365,12 @@ define <2 x i18> @v_ctlz_zero_undef_v2i18(<2 x i18> %val) { ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i18: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v0, 14, v0 -; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 14, v1 +; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0x3ffff, v0 +; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0x3ffff, v1 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 +; GFX9-GISEL-NEXT: v_subrev_u32_e32 v0, 14, v0 +; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 14, v1 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] %ctlz = call <2 x i18> @llvm.ctlz.v2i18(<2 x i18> %val, i1 true) ret <2 x i18> %ctlz @@ -2368,12 +2380,16 @@ define <2 x i16> @v_ctlz_zero_undef_v2i16(<2 x i16> %val) { ; SI-LABEL: v_ctlz_zero_undef_v2i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_ffbh_u32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_ffbh_u32_e32 v0, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, -16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, -16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_ctlz_zero_undef_v2i16: @@ -2394,10 +2410,12 @@ define <2 x i16> @v_ctlz_zero_undef_v2i16(<2 x i16> %val) { ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i16: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 16, v1 ; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-GISEL-NEXT: s_flbit_i32_b32 s4, 0 -; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, s4, 16, v0 +; GFX9-GISEL-NEXT: v_subrev_u32_e32 v0, 16, v0 +; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] %ctlz = call <2 x i16> @llvm.ctlz.v2i16(<2 x i16> %val, i1 true) ret <2 x i16> %ctlz @@ -2407,15 +2425,20 @@ define <3 x i16> @v_ctlz_zero_undef_v3i16(<3 x i16> %val) { ; SI-LABEL: v_ctlz_zero_undef_v3i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_ffbh_u32_e32 v1, v1 ; SI-NEXT: v_ffbh_u32_e32 v0, v0 -; SI-NEXT: v_ffbh_u32_e32 v3, v2 +; SI-NEXT: v_ffbh_u32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v2, 0x200000, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, -16, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, -16, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0xfff00000, v0 +; SI-NEXT: v_or_b32_e32 v2, 0x100000, v2 ; SI-NEXT: v_alignbit_b32 v1, v3, v0, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -2439,11 +2462,14 @@ define <3 x i16> @v_ctlz_zero_undef_v3i16(<3 x i16> %val) { ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v3i16: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX9-GISEL-NEXT: v_subrev_u32_e32 v2, 16, v2 ; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-GISEL-NEXT: s_flbit_i32_b32 s4, 0 -; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, s4, 16, v0 +; GFX9-GISEL-NEXT: v_subrev_u32_e32 v0, 16, v0 +; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 16, v1 +; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v2 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] %ctlz = call <3 x i16> @llvm.ctlz.v3i16(<3 x i16> %val, i1 true) ret <3 x i16> %ctlz @@ -2453,18 +2479,24 @@ define <4 x i16> @v_ctlz_zero_undef_v4i16(<4 x i16> %val) { ; SI-LABEL: v_ctlz_zero_undef_v4i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_ffbh_u32_e32 v3, v3 ; SI-NEXT: v_ffbh_u32_e32 v2, v2 ; SI-NEXT: v_ffbh_u32_e32 v1, v1 ; SI-NEXT: v_ffbh_u32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, -16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, -16, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0xfff00000, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0xfff00000, v0 ; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -2492,13 +2524,18 @@ define <4 x i16> @v_ctlz_zero_undef_v4i16(<4 x i16> %val) { ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v4i16: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX9-GISEL-NEXT: v_subrev_u32_e32 v2, 16, v2 ; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX9-GISEL-NEXT: v_subrev_u32_e32 v0, 16, v0 +; GFX9-GISEL-NEXT: v_subrev_u32_e32 v3, 16, v3 ; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-GISEL-NEXT: s_flbit_i32_b32 s4, 0 -; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, s4, 16, v0 -; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, s4, 16, v1 +; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 16, v1 +; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v1, 16, v2 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] %ctlz = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %val, i1 true) ret <4 x i16> %ctlz @@ -2508,24 +2545,27 @@ define <2 x i8> @v_ctlz_zero_undef_v2i8(<2 x i8> %val) { ; SI-LABEL: v_ctlz_zero_undef_v2i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_ffbh_u32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v1 ; SI-NEXT: v_ffbh_u32_e32 v0, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: v_subrev_i32_e32 v0, vcc, 24, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0xffffe800, v0 +; SI-NEXT: v_bfe_u32 v1, v0, 8, 8 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_ctlz_zero_undef_v2i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; VI-NEXT: v_ffbh_u32_e32 v1, v1 -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v1 -; VI-NEXT: v_ffbh_u32_e32 v0, v0 -; VI-NEXT: v_or_b32_e32 v0, v0, v2 -; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; VI-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; VI-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0xe800, v1 +; VI-NEXT: v_subrev_u16_e32 v0, 24, v0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshrrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; EG-LABEL: v_ctlz_zero_undef_v2i8: @@ -2536,8 +2576,10 @@ define <2 x i8> @v_ctlz_zero_undef_v2i8(<2 x i8> %val) { ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i8: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; GFX9-GISEL-NEXT: v_subrev_u32_e32 v0, 24, v0 +; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 24, v1 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] %ctlz = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %val, i1 true) ret <2 x i8> %ctlz @@ -2579,10 +2621,12 @@ define <2 x i7> @v_ctlz_zero_undef_v2i7(<2 x i7> %val) { ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i7: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v0, 25, v0 -; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 25, v1 +; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 +; GFX9-GISEL-NEXT: v_subrev_u32_e32 v0, 25, v0 +; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 25, v1 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] %ctlz = call <2 x i7> @llvm.ctlz.v2i7(<2 x i7> %val, i1 true) ret <2 x i7> %ctlz |