-rw-r--r-- | llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h | 11
-rw-r--r-- | llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h | 16
-rw-r--r-- | llvm/include/llvm/Target/GlobalISel/Combine.td | 17
-rw-r--r-- | llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp | 10
-rw-r--r-- | llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp | 48
-rw-r--r-- | llvm/test/CodeGen/AArch64/GlobalISel/combine-cast.mir | 131
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll | 38
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll | 4
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll | 8
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/ctlz.ll | 2
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll | 9
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/cttz.ll | 2
12 files changed, 263 insertions, 33 deletions
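This patch teaches the GlobalISel combiner to push a cast through a select: the new select_of_zext, select_of_anyext, and select_of_truncate rules rewrite cast(select(cond, a, b)) into select(cond, cast(a), cast(b)) when the select has a single non-debug use, the resulting G_SELECT is legal (or the combiner runs pre-legalization), and TargetLowering reports the cast as free (isZExtFree / isTruncateFree). The rewrite rests on the identity cast(select(c, a, b)) == select(c, cast(a), cast(b)). A minimal standalone C++ sketch of that identity follows; plain integer conversions stand in for the generic opcodes, and none of it is LLVM API code:

// Standalone illustration only: ordinary integer conversions stand in for
// G_ZEXT and G_TRUNC; nothing here uses LLVM APIs.
#include <cassert>
#include <cstdint>

// Models G_ZEXT s32 -> s64.
static uint64_t zext32to64(uint32_t v) { return static_cast<uint64_t>(v); }
// Models G_TRUNC s64 -> s32.
static uint32_t trunc64to32(uint64_t v) { return static_cast<uint32_t>(v); }

int main() {
  for (int ci = 0; ci < 2; ++ci) {
    bool c = ci != 0;

    // zext(select(c, a, b)) == select(c, zext(a), zext(b))
    uint32_t a = 0xDEADBEEFu, b = 7u;
    assert(zext32to64(c ? a : b) == (c ? zext32to64(a) : zext32to64(b)));

    // trunc(select(c, x, y)) == select(c, trunc(x), trunc(y))
    uint64_t x = 0x1122334455667788ull, y = 42;
    assert(trunc64to32(c ? x : y) == (c ? trunc64to32(x) : trunc64to32(y)));
  }
  return 0;
}

The one-use check keeps the combine from duplicating casts while the original select stays live; the test_combine_anyzext_select_multi_use MIR test below checks that such selects are left alone.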
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 47365c3..05d7e88 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -129,6 +129,12 @@ public: const TargetLowering &getTargetLowering() const; + const MachineFunction &getMachineFunction() const; + + const DataLayout &getDataLayout() const; + + LLVMContext &getContext() const; + /// \returns true if the combiner is running pre-legalization. bool isPreLegalize() const; @@ -884,6 +890,9 @@ public: bool matchTruncateOfExt(const MachineInstr &Root, const MachineInstr &ExtMI, BuildFnTy &MatchInfo); + bool matchCastOfSelect(const MachineInstr &Cast, const MachineInstr &SelectMI, + BuildFnTy &MatchInfo); + private: /// Checks for legality of an indexed variant of \p LdSt. bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const; @@ -996,6 +1005,8 @@ private: // Simplify (cmp cc0 x, y) (&& or ||) (cmp cc1 x, y) -> cmp cc2 x, y. bool tryFoldLogicOfFCmps(GLogicalBinOp *Logic, BuildFnTy &MatchInfo); + + bool isCastFree(unsigned Opcode, LLT ToTy, LLT FromTy) const; }; } // namespace llvm diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h index 8b7e8c0..ef1171d 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h @@ -934,6 +934,22 @@ public: }; }; +/// Represents an integer-like extending or truncating operation. +class GExtOrTruncOp : public GCastOp { +public: + static bool classof(const MachineInstr *MI) { + switch (MI->getOpcode()) { + case TargetOpcode::G_SEXT: + case TargetOpcode::G_ZEXT: + case TargetOpcode::G_ANYEXT: + case TargetOpcode::G_TRUNC: + return true; + default: + return false; + } + }; +}; + } // namespace llvm #endif // LLVM_CODEGEN_GLOBALISEL_GENERICMACHINEINSTRS_H diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 2362e77b..2246e20 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -1771,10 +1771,25 @@ def truncate_of_zext : truncate_of_opcode<G_ZEXT>; def truncate_of_sext : truncate_of_opcode<G_SEXT>; def truncate_of_anyext : truncate_of_opcode<G_ANYEXT>; +// Push cast through select. 
+class select_of_opcode<Instruction castOpcode> : GICombineRule < + (defs root:$root, build_fn_matchinfo:$matchinfo), + (match (G_SELECT $select, $cond, $true, $false):$Select, + (castOpcode $root, $select):$Cast, + [{ return Helper.matchCastOfSelect(*${Cast}, *${Select}, ${matchinfo}); }]), + (apply [{ Helper.applyBuildFn(*${Cast}, ${matchinfo}); }])>; + +def select_of_zext : select_of_opcode<G_ZEXT>; +def select_of_anyext : select_of_opcode<G_ANYEXT>; +def select_of_truncate : select_of_opcode<G_TRUNC>; + def cast_combines: GICombineGroup<[ truncate_of_zext, truncate_of_sext, - truncate_of_anyext + truncate_of_anyext, + select_of_zext, + select_of_anyext, + select_of_truncate ]>; diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 8c05931..d930ab2 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -68,6 +68,16 @@ const TargetLowering &CombinerHelper::getTargetLowering() const { return *Builder.getMF().getSubtarget().getTargetLowering(); } +const MachineFunction &CombinerHelper::getMachineFunction() const { + return Builder.getMF(); +} + +const DataLayout &CombinerHelper::getDataLayout() const { + return getMachineFunction().getDataLayout(); +} + +LLVMContext &CombinerHelper::getContext() const { return Builder.getContext(); } + /// \returns The little endian in-memory byte position of byte \p I in a /// \p ByteWidth bytes wide type. /// diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp index d36685b..59295f7 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp @@ -161,3 +161,51 @@ bool CombinerHelper::matchTruncateOfExt(const MachineInstr &Root, return false; } + +bool CombinerHelper::isCastFree(unsigned Opcode, LLT ToTy, LLT FromTy) const { + const TargetLowering &TLI = getTargetLowering(); + const DataLayout &DL = getDataLayout(); + LLVMContext &Ctx = getContext(); + + switch (Opcode) { + case TargetOpcode::G_ANYEXT: + case TargetOpcode::G_ZEXT: + return TLI.isZExtFree(FromTy, ToTy, DL, Ctx); + case TargetOpcode::G_TRUNC: + return TLI.isTruncateFree(FromTy, ToTy, DL, Ctx); + default: + return false; + } +} + +bool CombinerHelper::matchCastOfSelect(const MachineInstr &CastMI, + const MachineInstr &SelectMI, + BuildFnTy &MatchInfo) { + const GExtOrTruncOp *Cast = cast<GExtOrTruncOp>(&CastMI); + const GSelect *Select = cast<GSelect>(&SelectMI); + + if (!MRI.hasOneNonDBGUse(Select->getReg(0))) + return false; + + Register Dst = Cast->getReg(0); + LLT DstTy = MRI.getType(Dst); + LLT CondTy = MRI.getType(Select->getCondReg()); + Register TrueReg = Select->getTrueReg(); + Register FalseReg = Select->getFalseReg(); + LLT SrcTy = MRI.getType(TrueReg); + Register Cond = Select->getCondReg(); + + if (!isLegalOrBeforeLegalizer({TargetOpcode::G_SELECT, {DstTy, CondTy}})) + return false; + + if (!isCastFree(Cast->getOpcode(), DstTy, SrcTy)) + return false; + + MatchInfo = [=](MachineIRBuilder &B) { + auto True = B.buildInstr(Cast->getOpcode(), {DstTy}, {TrueReg}); + auto False = B.buildInstr(Cast->getOpcode(), {DstTy}, {FalseReg}); + B.buildSelect(Dst, Cond, True, False); + }; + + return true; +} diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-cast.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-cast.mir new file mode 100644 index 0000000..0f43612 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-cast.mir @@ 
-0,0 +1,131 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -o - -mtriple=aarch64-unknown-unknown -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s | FileCheck %s --check-prefixes=CHECK,CHECK-PRE +# RUN: llc -o - -mtriple=aarch64-unknown-unknown -run-pass=aarch64-postlegalizer-combiner -verify-machineinstrs %s | FileCheck %s --check-prefixes=CHECK,CHECK-POST + +--- +name: test_combine_trunc_select +legalized: true +body: | + bb.1: + ; CHECK-PRE-LABEL: name: test_combine_trunc_select + ; CHECK-PRE: %cond:_(s32) = COPY $w0 + ; CHECK-PRE-NEXT: %lhs:_(s64) = COPY $x0 + ; CHECK-PRE-NEXT: %rhs:_(s64) = COPY $x0 + ; CHECK-PRE-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %lhs(s64) + ; CHECK-PRE-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC %rhs(s64) + ; CHECK-PRE-NEXT: %small:_(s32) = G_SELECT %cond(s32), [[TRUNC]], [[TRUNC1]] + ; CHECK-PRE-NEXT: $w0 = COPY %small(s32) + ; + ; CHECK-POST-LABEL: name: test_combine_trunc_select + ; CHECK-POST: %cond:_(s32) = COPY $w0 + ; CHECK-POST-NEXT: %lhs:_(s64) = COPY $x0 + ; CHECK-POST-NEXT: %rhs:_(s64) = COPY $x0 + ; CHECK-POST-NEXT: %res:_(s64) = G_SELECT %cond(s32), %lhs, %rhs + ; CHECK-POST-NEXT: %small:_(s32) = G_TRUNC %res(s64) + ; CHECK-POST-NEXT: $w0 = COPY %small(s32) + %cond:_(s32) = COPY $w0 + %lhs:_(s64) = COPY $x0 + %rhs:_(s64) = COPY $x0 + %res:_(s64) = G_SELECT %cond(s32), %lhs, %rhs + %small:_(s32) = G_TRUNC %res(s64) + $w0 = COPY %small(s32) +... +--- +name: test_combine_zext_select +legalized: true +body: | + bb.1: + ; CHECK-PRE-LABEL: name: test_combine_zext_select + ; CHECK-PRE: %cond:_(s32) = COPY $w0 + ; CHECK-PRE-NEXT: %lhs:_(s32) = COPY $w0 + ; CHECK-PRE-NEXT: %rhs:_(s32) = COPY $w0 + ; CHECK-PRE-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT %lhs(s32) + ; CHECK-PRE-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT %rhs(s32) + ; CHECK-PRE-NEXT: %big:_(s64) = G_SELECT %cond(s32), [[ZEXT]], [[ZEXT1]] + ; CHECK-PRE-NEXT: $x0 = COPY %big(s64) + ; + ; CHECK-POST-LABEL: name: test_combine_zext_select + ; CHECK-POST: %cond:_(s32) = COPY $w0 + ; CHECK-POST-NEXT: %lhs:_(s32) = COPY $w0 + ; CHECK-POST-NEXT: %rhs:_(s32) = COPY $w0 + ; CHECK-POST-NEXT: %res:_(s32) = G_SELECT %cond(s32), %lhs, %rhs + ; CHECK-POST-NEXT: %big:_(s64) = G_ZEXT %res(s32) + ; CHECK-POST-NEXT: $x0 = COPY %big(s64) + %cond:_(s32) = COPY $w0 + %lhs:_(s32) = COPY $w0 + %rhs:_(s32) = COPY $w0 + %res:_(s32) = G_SELECT %cond(s32), %lhs, %rhs + %big:_(s64) = G_ZEXT %res(s32) + $x0 = COPY %big(s64) +... +--- +name: test_combine_anyzext_select +legalized: true +body: | + bb.1: + ; CHECK-PRE-LABEL: name: test_combine_anyzext_select + ; CHECK-PRE: %cond:_(s32) = COPY $w0 + ; CHECK-PRE-NEXT: %lhs:_(s32) = COPY $w0 + ; CHECK-PRE-NEXT: %rhs:_(s32) = COPY $w0 + ; CHECK-PRE-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT %lhs(s32) + ; CHECK-PRE-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT %rhs(s32) + ; CHECK-PRE-NEXT: %big:_(s64) = G_SELECT %cond(s32), [[ANYEXT]], [[ANYEXT1]] + ; CHECK-PRE-NEXT: $x0 = COPY %big(s64) + ; + ; CHECK-POST-LABEL: name: test_combine_anyzext_select + ; CHECK-POST: %cond:_(s32) = COPY $w0 + ; CHECK-POST-NEXT: %lhs:_(s32) = COPY $w0 + ; CHECK-POST-NEXT: %rhs:_(s32) = COPY $w0 + ; CHECK-POST-NEXT: %res:_(s32) = G_SELECT %cond(s32), %lhs, %rhs + ; CHECK-POST-NEXT: %big:_(s64) = G_ANYEXT %res(s32) + ; CHECK-POST-NEXT: $x0 = COPY %big(s64) + %cond:_(s32) = COPY $w0 + %lhs:_(s32) = COPY $w0 + %rhs:_(s32) = COPY $w0 + %res:_(s32) = G_SELECT %cond(s32), %lhs, %rhs + %big:_(s64) = G_ANYEXT %res(s32) + $x0 = COPY %big(s64) +... 
+--- +name: test_combine_anyzext_select_multi_use +legalized: true +body: | + bb.1: + ; CHECK-LABEL: name: test_combine_anyzext_select_multi_use + ; CHECK: %cond:_(s32) = COPY $w0 + ; CHECK-NEXT: %lhs:_(s32) = COPY $w0 + ; CHECK-NEXT: %rhs:_(s32) = COPY $w0 + ; CHECK-NEXT: %res:_(s32) = G_SELECT %cond(s32), %lhs, %rhs + ; CHECK-NEXT: %big:_(s64) = G_ANYEXT %res(s32) + ; CHECK-NEXT: $x0 = COPY %big(s64) + ; CHECK-NEXT: $w0 = COPY %res(s32) + %cond:_(s32) = COPY $w0 + %lhs:_(s32) = COPY $w0 + %rhs:_(s32) = COPY $w0 + %res:_(s32) = G_SELECT %cond(s32), %lhs, %rhs + %big:_(s64) = G_ANYEXT %res(s32) + $x0 = COPY %big(s64) + $w0 = COPY %res(s32) +... +--- +name: test_combine_trunc_select_vector_out_of_budget +legalized: true +body: | + bb.1: + ; CHECK-LABEL: name: test_combine_trunc_select_vector_out_of_budget + ; CHECK: %cond:_(<2 x s32>) = COPY $x0 + ; CHECK-NEXT: %arg1:_(s64) = COPY $x0 + ; CHECK-NEXT: %arg2:_(s64) = COPY $x0 + ; CHECK-NEXT: %bv:_(<2 x s64>) = G_BUILD_VECTOR %arg1(s64), %arg2(s64) + ; CHECK-NEXT: %bv2:_(<2 x s64>) = G_BUILD_VECTOR %arg2(s64), %arg1(s64) + ; CHECK-NEXT: %res:_(<2 x s64>) = G_SELECT %cond(<2 x s32>), %bv, %bv2 + ; CHECK-NEXT: %small:_(<2 x s32>) = G_TRUNC %res(<2 x s64>) + ; CHECK-NEXT: $x0 = COPY %small(<2 x s32>) + %cond:_(<2 x s32>) = COPY $x0 + %arg1:_(s64) = COPY $x0 + %arg2:_(s64) = COPY $x0 + %bv:_(<2 x s64>) = G_BUILD_VECTOR %arg1(s64), %arg2(s64) + %bv2:_(<2 x s64>) = G_BUILD_VECTOR %arg2(s64), %arg1(s64) + %res:_(<2 x s64>) = G_SELECT %cond(<2 x s32>), %bv, %bv2 + %small:_(<2 x s32>) = G_TRUNC %res(<2 x s64>) + $x0 = COPY %small(<2 x s32>) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll index ec832ed..63f5464 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -1845,39 +1845,37 @@ define amdgpu_ps i65 @s_ashr_i65(i65 inreg %value, i65 inreg %amount) { ; GCN-NEXT: s_lshr_b64 s[2:3], s[0:1], s3 ; GCN-NEXT: s_lshl_b64 s[8:9], s[4:5], s8 ; GCN-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GCN-NEXT: s_ashr_i32 s8, s5, 31 +; GCN-NEXT: s_ashr_i32 s7, s5, 31 ; GCN-NEXT: s_ashr_i64 s[4:5], s[4:5], s10 ; GCN-NEXT: s_cmp_lg_u32 s11, 0 ; GCN-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] ; GCN-NEXT: s_cmp_lg_u32 s12, 0 -; GCN-NEXT: s_mov_b32 s9, s8 ; GCN-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GCN-NEXT: s_cmp_lg_u32 s11, 0 -; GCN-NEXT: s_cselect_b64 s[2:3], s[6:7], s[8:9] +; GCN-NEXT: s_cselect_b32 s2, s6, s7 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_ashr_i65: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000 -; GFX10PLUS-NEXT: s_sub_i32 s12, s3, 64 -; GFX10PLUS-NEXT: s_sub_i32 s8, 64, s3 +; GFX10PLUS-NEXT: s_sub_i32 s10, s3, 64 +; GFX10PLUS-NEXT: s_sub_i32 s2, 64, s3 ; GFX10PLUS-NEXT: s_cmp_lt_u32 s3, 64 -; GFX10PLUS-NEXT: s_cselect_b32 s13, 1, 0 +; GFX10PLUS-NEXT: s_cselect_b32 s11, 1, 0 ; GFX10PLUS-NEXT: s_cmp_eq_u32 s3, 0 -; GFX10PLUS-NEXT: s_cselect_b32 s14, 1, 0 -; GFX10PLUS-NEXT: s_ashr_i64 s[6:7], s[4:5], s3 -; GFX10PLUS-NEXT: s_lshr_b64 s[2:3], s[0:1], s3 -; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[4:5], s8 -; GFX10PLUS-NEXT: s_ashr_i32 s10, s5, 31 -; GFX10PLUS-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX10PLUS-NEXT: s_ashr_i64 s[4:5], s[4:5], s12 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s13, 0 -; GFX10PLUS-NEXT: s_mov_b32 s11, s10 -; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] -; GFX10PLUS-NEXT: s_cmp_lg_u32 s14, 0 -; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] -; GFX10PLUS-NEXT: s_cmp_lg_u32 
s13, 0 -; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], s[6:7], s[10:11] +; GFX10PLUS-NEXT: s_cselect_b32 s12, 1, 0 +; GFX10PLUS-NEXT: s_lshr_b64 s[6:7], s[0:1], s3 +; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[4:5], s2 +; GFX10PLUS-NEXT: s_ashr_i64 s[2:3], s[4:5], s3 +; GFX10PLUS-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX10PLUS-NEXT: s_ashr_i32 s3, s5, 31 +; GFX10PLUS-NEXT: s_ashr_i64 s[4:5], s[4:5], s10 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0 +; GFX10PLUS-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] +; GFX10PLUS-NEXT: s_cmp_lg_u32 s12, 0 +; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] +; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0 +; GFX10PLUS-NEXT: s_cselect_b32 s2, s2, s3 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr i65 %value, %amount ret i65 %result diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll index 980ba3d..5dd4fa0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -1766,7 +1766,7 @@ define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) { ; GCN-NEXT: s_cmp_lg_u32 s12, 0 ; GCN-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GCN-NEXT: s_cmp_lg_u32 s11, 0 -; GCN-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 +; GCN-NEXT: s_cselect_b32 s2, s6, 0 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_lshr_i65: @@ -1788,7 +1788,7 @@ define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) { ; GFX10PLUS-NEXT: s_cmp_lg_u32 s12, 0 ; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] ; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0 -; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX10PLUS-NEXT: s_cselect_b32 s2, s2, 0 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i65 %value, %amount ret i65 %result diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll index c2f911c..4cf1c92 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll @@ -1733,9 +1733,9 @@ define amdgpu_ps i65 @s_shl_i65(i65 inreg %value, i65 inreg %amount) { ; GCN-NEXT: s_lshl_b64 s[8:9], s[0:1], s10 ; GCN-NEXT: s_cmp_lg_u32 s11, 0 ; GCN-NEXT: s_cselect_b64 s[0:1], s[4:5], 0 -; GCN-NEXT: s_cselect_b64 s[4:5], s[6:7], s[8:9] +; GCN-NEXT: s_cselect_b32 s3, s6, s8 ; GCN-NEXT: s_cmp_lg_u32 s12, 0 -; GCN-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] +; GCN-NEXT: s_cselect_b32 s2, s2, s3 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_shl_i65: @@ -1753,9 +1753,9 @@ define amdgpu_ps i65 @s_shl_i65(i65 inreg %value, i65 inreg %amount) { ; GFX10PLUS-NEXT: s_lshl_b64 s[6:7], s[0:1], s10 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0 ; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[8:9], 0 -; GFX10PLUS-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] +; GFX10PLUS-NEXT: s_cselect_b32 s3, s4, s6 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s12, 0 -; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] +; GFX10PLUS-NEXT: s_cselect_b32 s2, s2, s3 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl i65 %value, %amount ret i65 %result diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll index ba0a1e7..a0b5497 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -1593,7 +1593,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 24, v1 -; GFX10-GISEL-NEXT: 
v_cndmask_b32_e64 v0, v1, -1, vcc_lo +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0xffff, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll index a55c8cd..2168e7f 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -1706,11 +1706,12 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa ; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v3, vcc ; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v0 -; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 -; GFX9-GISEL-NEXT: v_cmp_eq_u32_sdwa s[0:1], v0, v1 src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, -1, s[0:1] +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v0 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v3, v3 +; GFX9-GISEL-NEXT: v_cmp_eq_u32_sdwa vcc, v0, v1 src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll index 57fe6cd..14e6c4b 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz.ll @@ -1359,7 +1359,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX10-GISEL-NEXT: v_or_b32_e32 v1, 0x100, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_sdwa s0, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, s0 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0xffff, s0 ; GFX10-GISEL-NEXT: global_store_byte v2, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() |
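The AMDGPU test updates above are the downstream effect of the new trunc-through-select rule: once the select operates on already-truncated operands, the i65 shift lowering selects at 32 bits (s_cselect_b32) instead of on a 64-bit register pair (s_cselect_b64), and in the ctlz/cttz tests the all-ones operand is materialized at the narrower select width (0xffff rather than a 32-bit -1). A minimal standalone sketch of why the narrower select gives the same result; plain C++ casts with hypothetical helper names, no LLVM or AMDGPU APIs:

// Standalone illustration only: why selecting after a free truncate matches
// the old select-then-truncate result, mirroring the s_cselect_b64 ->
// s_cselect_b32 changes in the i65 shift tests above.
#include <cassert>
#include <cstdint>

// Old shape: 64-bit select first, then truncate the result.
static uint32_t select_then_trunc(bool c, uint64_t x, uint64_t y) {
  uint64_t wide = c ? x : y;
  return static_cast<uint32_t>(wide);
}

// New shape: truncate both operands (free), then do a 32-bit select.
static uint32_t trunc_then_select(bool c, uint64_t x, uint64_t y) {
  uint32_t xlo = static_cast<uint32_t>(x);
  uint32_t ylo = static_cast<uint32_t>(y);
  return c ? xlo : ylo;
}

int main() {
  const uint64_t x = 0xAABBCCDD11223344ull;
  const uint64_t y = 0x00000001FFFFFFFFull;
  assert(select_then_trunc(true, x, y) == trunc_then_select(true, x, y));
  assert(select_then_trunc(false, x, y) == trunc_then_select(false, x, y));
  return 0;
}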