author     Dmitry Chernenkov <dmitryc@google.com>    2024-09-25 14:50:04 +0000
committer  Dmitry Chernenkov <dmitryc@google.com>    2024-09-25 14:50:26 +0000
commit     4cb61c20ef38c6020389a15e739bac929b15425a
tree       6ff38b26d318507edc50b07b0a6e9b159f87748d
parent     9a0e281e8ccfc57ed5a5754d320b710efad6d303
Revert "[NVPTX] deprecate nvvm.rotate.* intrinsics, cleanup funnel-shift handling (#107655)"
This reverts commit 9ac00b85e05d21be658d6aa0c91cbe05bb5dbde2.
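
For reference, the reverted change had auto-upgraded the NVVM rotate intrinsics into the generic funnel-shift intrinsics (a rotate is a funnel shift with both inputs equal). A rough sketch of that mapping, taken from the removed @rotate test in auto_upgrade_nvvm_intrinsics.ll below (the i32 shift amount was zero-extended for the 64-bit forms):

  %r1 = call i32 @llvm.nvvm.rotate.b32(i32 %a, i32 6)        ; was upgraded to: call i32 @llvm.fshl.i32(i32 %a, i32 %a, i32 6)
  %r2 = call i64 @llvm.nvvm.rotate.right.b64(i64 %b, i32 7)  ; was upgraded to: call i64 @llvm.fshr.i64(i64 %b, i64 %b, i64 7)
  %r3 = call i64 @llvm.nvvm.rotate.b64(i64 %b, i32 8)        ; was upgraded to: call i64 @llvm.fshl.i64(i64 %b, i64 %b, i64 8)

With this revert, the llvm.nvvm.rotate.* declarations and their NVPTX lowering are restored, and such calls are no longer rewritten.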
llvm/docs/ReleaseNotes.rst                          |   6
llvm/include/llvm/IR/IntrinsicsNVVM.td              |  16
llvm/lib/IR/AutoUpgrade.cpp                         | 184
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp         |  21
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td             | 197
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td            | 129
llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll |  18
llvm/test/CodeGen/NVPTX/rotate.ll                   | 433
llvm/test/CodeGen/NVPTX/rotate_64.ll                |  33
9 files changed, 574 insertions, 463 deletions
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 0784d93..05f5bd6 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -63,12 +63,6 @@ Changes to the LLVM IR * ``llvm.nvvm.bitcast.d2ll`` * ``llvm.nvvm.bitcast.ll2d`` -* Remove the following intrinsics which can be replaced with a funnel-shift: - - * ``llvm.nvvm.rotate.b32`` - * ``llvm.nvvm.rotate.right.b64`` - * ``llvm.nvvm.rotate.b64`` - Changes to LLVM infrastructure ------------------------------ diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index aa5294f..737dd60 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -4479,6 +4479,22 @@ def int_nvvm_sust_p_3d_v4i32_trap "llvm.nvvm.sust.p.3d.v4i32.trap">, ClangBuiltin<"__nvvm_sust_p_3d_v4i32_trap">; + +def int_nvvm_rotate_b32 + : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable], "llvm.nvvm.rotate.b32">, + ClangBuiltin<"__nvvm_rotate_b32">; + +def int_nvvm_rotate_b64 + : DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable], "llvm.nvvm.rotate.b64">, + ClangBuiltin<"__nvvm_rotate_b64">; + +def int_nvvm_rotate_right_b64 + : DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable], "llvm.nvvm.rotate.right.b64">, + ClangBuiltin<"__nvvm_rotate_right_b64">; + def int_nvvm_swap_lo_hi_b64 : DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable], "llvm.nvvm.swap.lo.hi.b64">, diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 3390d65..02d1d9d 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -1272,9 +1272,6 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, // nvvm.bitcast.{f2i,i2f,ll2d,d2ll} Expand = Name == "f2i" || Name == "i2f" || Name == "ll2d" || Name == "d2ll"; - else if (Name.consume_front("rotate.")) - // nvvm.rotate.{b32,b64,right.b64} - Expand = Name == "b32" || Name == "b64" || Name == "right.b64"; else Expand = false; @@ -2261,108 +2258,6 @@ void llvm::UpgradeInlineAsmString(std::string *AsmStr) { } } -static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI, - Function *F, IRBuilder<> &Builder) { - Value *Rep = nullptr; - - if (Name == "abs.i" || Name == "abs.ll") { - Value *Arg = CI->getArgOperand(0); - Value *Neg = Builder.CreateNeg(Arg, "neg"); - Value *Cmp = Builder.CreateICmpSGE( - Arg, llvm::Constant::getNullValue(Arg->getType()), "abs.cond"); - Rep = Builder.CreateSelect(Cmp, Arg, Neg, "abs"); - } else if (Name.starts_with("atomic.load.add.f32.p") || - Name.starts_with("atomic.load.add.f64.p")) { - Value *Ptr = CI->getArgOperand(0); - Value *Val = CI->getArgOperand(1); - Rep = Builder.CreateAtomicRMW(AtomicRMWInst::FAdd, Ptr, Val, MaybeAlign(), - AtomicOrdering::SequentiallyConsistent); - } else if (Name.consume_front("max.") && - (Name == "s" || Name == "i" || Name == "ll" || Name == "us" || - Name == "ui" || Name == "ull")) { - Value *Arg0 = CI->getArgOperand(0); - Value *Arg1 = CI->getArgOperand(1); - Value *Cmp = Name.starts_with("u") - ? 
Builder.CreateICmpUGE(Arg0, Arg1, "max.cond") - : Builder.CreateICmpSGE(Arg0, Arg1, "max.cond"); - Rep = Builder.CreateSelect(Cmp, Arg0, Arg1, "max"); - } else if (Name.consume_front("min.") && - (Name == "s" || Name == "i" || Name == "ll" || Name == "us" || - Name == "ui" || Name == "ull")) { - Value *Arg0 = CI->getArgOperand(0); - Value *Arg1 = CI->getArgOperand(1); - Value *Cmp = Name.starts_with("u") - ? Builder.CreateICmpULE(Arg0, Arg1, "min.cond") - : Builder.CreateICmpSLE(Arg0, Arg1, "min.cond"); - Rep = Builder.CreateSelect(Cmp, Arg0, Arg1, "min"); - } else if (Name == "clz.ll") { - // llvm.nvvm.clz.ll returns an i32, but llvm.ctlz.i64 returns an i64. - Value *Arg = CI->getArgOperand(0); - Value *Ctlz = Builder.CreateCall( - Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz, - {Arg->getType()}), - {Arg, Builder.getFalse()}, "ctlz"); - Rep = Builder.CreateTrunc(Ctlz, Builder.getInt32Ty(), "ctlz.trunc"); - } else if (Name == "popc.ll") { - // llvm.nvvm.popc.ll returns an i32, but llvm.ctpop.i64 returns an - // i64. - Value *Arg = CI->getArgOperand(0); - Value *Popc = Builder.CreateCall( - Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctpop, - {Arg->getType()}), - Arg, "ctpop"); - Rep = Builder.CreateTrunc(Popc, Builder.getInt32Ty(), "ctpop.trunc"); - } else if (Name == "h2f") { - Rep = Builder.CreateCall( - Intrinsic::getDeclaration(F->getParent(), Intrinsic::convert_from_fp16, - {Builder.getFloatTy()}), - CI->getArgOperand(0), "h2f"); - } else if (Name.consume_front("bitcast.") && - (Name == "f2i" || Name == "i2f" || Name == "ll2d" || - Name == "d2ll")) { - Rep = Builder.CreateBitCast(CI->getArgOperand(0), CI->getType()); - } else if (Name == "rotate.b32") { - Value *Arg = CI->getOperand(0); - Value *ShiftAmt = CI->getOperand(1); - Rep = Builder.CreateIntrinsic(Builder.getInt32Ty(), Intrinsic::fshl, - {Arg, Arg, ShiftAmt}); - } else if (Name == "rotate.b64") { - Type *Int64Ty = Builder.getInt64Ty(); - Value *Arg = CI->getOperand(0); - Value *ZExtShiftAmt = Builder.CreateZExt(CI->getOperand(1), Int64Ty); - Rep = Builder.CreateIntrinsic(Int64Ty, Intrinsic::fshl, - {Arg, Arg, ZExtShiftAmt}); - } else if (Name == "rotate.right.b64") { - Type *Int64Ty = Builder.getInt64Ty(); - Value *Arg = CI->getOperand(0); - Value *ZExtShiftAmt = Builder.CreateZExt(CI->getOperand(1), Int64Ty); - Rep = Builder.CreateIntrinsic(Int64Ty, Intrinsic::fshr, - {Arg, Arg, ZExtShiftAmt}); - } else { - Intrinsic::ID IID = shouldUpgradeNVPTXBF16Intrinsic(Name); - if (IID != Intrinsic::not_intrinsic && - !F->getReturnType()->getScalarType()->isBFloatTy()) { - rename(F); - Function *NewFn = Intrinsic::getDeclaration(F->getParent(), IID); - SmallVector<Value *, 2> Args; - for (size_t I = 0; I < NewFn->arg_size(); ++I) { - Value *Arg = CI->getArgOperand(I); - Type *OldType = Arg->getType(); - Type *NewType = NewFn->getArg(I)->getType(); - Args.push_back( - (OldType->isIntegerTy() && NewType->getScalarType()->isBFloatTy()) - ? 
Builder.CreateBitCast(Arg, NewType) - : Arg); - } - Rep = Builder.CreateCall(NewFn, Args); - if (F->getReturnType()->isIntegerTy()) - Rep = Builder.CreateBitCast(Rep, F->getReturnType()); - } - } - - return Rep; -} - static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, IRBuilder<> &Builder) { LLVMContext &C = F->getContext(); @@ -4313,8 +4208,85 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { if (!IsX86 && Name == "stackprotectorcheck") { Rep = nullptr; + } else if (IsNVVM && (Name == "abs.i" || Name == "abs.ll")) { + Value *Arg = CI->getArgOperand(0); + Value *Neg = Builder.CreateNeg(Arg, "neg"); + Value *Cmp = Builder.CreateICmpSGE( + Arg, llvm::Constant::getNullValue(Arg->getType()), "abs.cond"); + Rep = Builder.CreateSelect(Cmp, Arg, Neg, "abs"); + } else if (IsNVVM && (Name.starts_with("atomic.load.add.f32.p") || + Name.starts_with("atomic.load.add.f64.p"))) { + Value *Ptr = CI->getArgOperand(0); + Value *Val = CI->getArgOperand(1); + Rep = Builder.CreateAtomicRMW(AtomicRMWInst::FAdd, Ptr, Val, MaybeAlign(), + AtomicOrdering::SequentiallyConsistent); + } else if (IsNVVM && Name.consume_front("max.") && + (Name == "s" || Name == "i" || Name == "ll" || Name == "us" || + Name == "ui" || Name == "ull")) { + Value *Arg0 = CI->getArgOperand(0); + Value *Arg1 = CI->getArgOperand(1); + Value *Cmp = Name.starts_with("u") + ? Builder.CreateICmpUGE(Arg0, Arg1, "max.cond") + : Builder.CreateICmpSGE(Arg0, Arg1, "max.cond"); + Rep = Builder.CreateSelect(Cmp, Arg0, Arg1, "max"); + } else if (IsNVVM && Name.consume_front("min.") && + (Name == "s" || Name == "i" || Name == "ll" || Name == "us" || + Name == "ui" || Name == "ull")) { + Value *Arg0 = CI->getArgOperand(0); + Value *Arg1 = CI->getArgOperand(1); + Value *Cmp = Name.starts_with("u") + ? Builder.CreateICmpULE(Arg0, Arg1, "min.cond") + : Builder.CreateICmpSLE(Arg0, Arg1, "min.cond"); + Rep = Builder.CreateSelect(Cmp, Arg0, Arg1, "min"); + } else if (IsNVVM && Name == "clz.ll") { + // llvm.nvvm.clz.ll returns an i32, but llvm.ctlz.i64 returns an i64. + Value *Arg = CI->getArgOperand(0); + Value *Ctlz = Builder.CreateCall( + Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz, + {Arg->getType()}), + {Arg, Builder.getFalse()}, "ctlz"); + Rep = Builder.CreateTrunc(Ctlz, Builder.getInt32Ty(), "ctlz.trunc"); + } else if (IsNVVM && Name == "popc.ll") { + // llvm.nvvm.popc.ll returns an i32, but llvm.ctpop.i64 returns an + // i64. 
+ Value *Arg = CI->getArgOperand(0); + Value *Popc = Builder.CreateCall( + Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctpop, + {Arg->getType()}), + Arg, "ctpop"); + Rep = Builder.CreateTrunc(Popc, Builder.getInt32Ty(), "ctpop.trunc"); } else if (IsNVVM) { - Rep = upgradeNVVMIntrinsicCall(Name, CI, F, Builder); + if (Name == "h2f") { + Rep = + Builder.CreateCall(Intrinsic::getDeclaration( + F->getParent(), Intrinsic::convert_from_fp16, + {Builder.getFloatTy()}), + CI->getArgOperand(0), "h2f"); + } else if (Name.consume_front("bitcast.") && + (Name == "f2i" || Name == "i2f" || Name == "ll2d" || + Name == "d2ll")) { + Rep = Builder.CreateBitCast(CI->getArgOperand(0), CI->getType()); + } else { + Intrinsic::ID IID = shouldUpgradeNVPTXBF16Intrinsic(Name); + if (IID != Intrinsic::not_intrinsic && + !F->getReturnType()->getScalarType()->isBFloatTy()) { + rename(F); + NewFn = Intrinsic::getDeclaration(F->getParent(), IID); + SmallVector<Value *, 2> Args; + for (size_t I = 0; I < NewFn->arg_size(); ++I) { + Value *Arg = CI->getArgOperand(I); + Type *OldType = Arg->getType(); + Type *NewType = NewFn->getArg(I)->getType(); + Args.push_back((OldType->isIntegerTy() && + NewType->getScalarType()->isBFloatTy()) + ? Builder.CreateBitCast(Arg, NewType) + : Arg); + } + Rep = Builder.CreateCall(NewFn, Args); + if (F->getReturnType()->isIntegerTy()) + Rep = Builder.CreateBitCast(Rep, F->getReturnType()); + } + } } else if (IsX86) { Rep = upgradeX86IntrinsicCall(Name, CI, F, Builder); } else if (IsARM) { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 8812136..2688834 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -594,13 +594,20 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); - setOperationAction({ISD::ROTL, ISD::ROTR}, - {MVT::i8, MVT::i16, MVT::v2i16, MVT::i32, MVT::i64}, - Expand); - - if (STI.hasHWROT32()) - setOperationAction({ISD::FSHL, ISD::FSHR}, MVT::i32, Legal); - + // TODO: we may consider expanding ROTL/ROTR on older GPUs. Currently on GPUs + // that don't have h/w rotation we lower them to multi-instruction assembly. + // See ROT*_sw in NVPTXIntrInfo.td + setOperationAction(ISD::ROTL, MVT::i64, Legal); + setOperationAction(ISD::ROTR, MVT::i64, Legal); + setOperationAction(ISD::ROTL, MVT::i32, Legal); + setOperationAction(ISD::ROTR, MVT::i32, Legal); + + setOperationAction(ISD::ROTL, MVT::i16, Expand); + setOperationAction(ISD::ROTL, MVT::v2i16, Expand); + setOperationAction(ISD::ROTR, MVT::i16, Expand); + setOperationAction(ISD::ROTR, MVT::v2i16, Expand); + setOperationAction(ISD::ROTL, MVT::i8, Expand); + setOperationAction(ISD::ROTR, MVT::i8, Expand); setOperationAction(ISD::BSWAP, MVT::i16, Expand); setOperationAction(ISD::BR_JT, MVT::Other, Custom); diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index f6bbf4c..510e4b8 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -1665,6 +1665,167 @@ def BREV64 : "brev.b64 \t$dst, $a;", [(set Int64Regs:$dst, (bitreverse Int64Regs:$a))]>; +// +// Rotate: Use ptx shf instruction if available. 
+// + +// 32 bit r2 = rotl r1, n +// => +// r2 = shf.l r1, r1, n +def ROTL32imm_hw : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt), + "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, (rotl (i32 Int32Regs:$src), (i32 imm:$amt)))]>, + Requires<[hasHWROT32]>; + +def ROTL32reg_hw : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), + "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, (rotl (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>, + Requires<[hasHWROT32]>; + +// 32 bit r2 = rotr r1, n +// => +// r2 = shf.r r1, r1, n +def ROTR32imm_hw : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt), + "shf.r.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, (rotr (i32 Int32Regs:$src), (i32 imm:$amt)))]>, + Requires<[hasHWROT32]>; + +def ROTR32reg_hw : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), + "shf.r.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, (rotr (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>, + Requires<[hasHWROT32]>; + +// 32-bit software rotate by immediate. $amt2 should equal 32 - $amt1. +def ROT32imm_sw : + NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2), + "{{\n\t" + ".reg .b32 %lhs;\n\t" + ".reg .b32 %rhs;\n\t" + "shl.b32 \t%lhs, $src, $amt1;\n\t" + "shr.b32 \t%rhs, $src, $amt2;\n\t" + "add.u32 \t$dst, %lhs, %rhs;\n\t" + "}}", + []>; + +def SUB_FRM_32 : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(32 - N->getZExtValue(), SDLoc(N), MVT::i32); +}]>; + +def : Pat<(rotl (i32 Int32Regs:$src), (i32 imm:$amt)), + (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>, + Requires<[noHWROT32]>; +def : Pat<(rotr (i32 Int32Regs:$src), (i32 imm:$amt)), + (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>, + Requires<[noHWROT32]>; + +// 32-bit software rotate left by register. +def ROTL32reg_sw : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), + "{{\n\t" + ".reg .b32 %lhs;\n\t" + ".reg .b32 %rhs;\n\t" + ".reg .b32 %amt2;\n\t" + "shl.b32 \t%lhs, $src, $amt;\n\t" + "sub.s32 \t%amt2, 32, $amt;\n\t" + "shr.b32 \t%rhs, $src, %amt2;\n\t" + "add.u32 \t$dst, %lhs, %rhs;\n\t" + "}}", + [(set Int32Regs:$dst, (rotl (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>, + Requires<[noHWROT32]>; + +// 32-bit software rotate right by register. +def ROTR32reg_sw : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), + "{{\n\t" + ".reg .b32 %lhs;\n\t" + ".reg .b32 %rhs;\n\t" + ".reg .b32 %amt2;\n\t" + "shr.b32 \t%lhs, $src, $amt;\n\t" + "sub.s32 \t%amt2, 32, $amt;\n\t" + "shl.b32 \t%rhs, $src, %amt2;\n\t" + "add.u32 \t$dst, %lhs, %rhs;\n\t" + "}}", + [(set Int32Regs:$dst, (rotr (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>, + Requires<[noHWROT32]>; + +// 64-bit software rotate by immediate. $amt2 should equal 64 - $amt1. 
+def ROT64imm_sw : + NVPTXInst<(outs Int64Regs:$dst), + (ins Int64Regs:$src, i32imm:$amt1, i32imm:$amt2), + "{{\n\t" + ".reg .b64 %lhs;\n\t" + ".reg .b64 %rhs;\n\t" + "shl.b64 \t%lhs, $src, $amt1;\n\t" + "shr.b64 \t%rhs, $src, $amt2;\n\t" + "add.u64 \t$dst, %lhs, %rhs;\n\t" + "}}", + []>; + +def SUB_FRM_64 : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(64-N->getZExtValue(), SDLoc(N), MVT::i32); +}]>; + +def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)), + (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>; +def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)), + (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>; + +// 64-bit software rotate left by register. +def ROTL64reg_sw : + NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt), + "{{\n\t" + ".reg .b64 %lhs;\n\t" + ".reg .b64 %rhs;\n\t" + ".reg .u32 %amt2;\n\t" + "and.b32 \t%amt2, $amt, 63;\n\t" + "shl.b64 \t%lhs, $src, %amt2;\n\t" + "sub.u32 \t%amt2, 64, %amt2;\n\t" + "shr.b64 \t%rhs, $src, %amt2;\n\t" + "add.u64 \t$dst, %lhs, %rhs;\n\t" + "}}", + [(set Int64Regs:$dst, (rotl Int64Regs:$src, (i32 Int32Regs:$amt)))]>; + +def ROTR64reg_sw : + NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt), + "{{\n\t" + ".reg .b64 %lhs;\n\t" + ".reg .b64 %rhs;\n\t" + ".reg .u32 %amt2;\n\t" + "and.b32 \t%amt2, $amt, 63;\n\t" + "shr.b64 \t%lhs, $src, %amt2;\n\t" + "sub.u32 \t%amt2, 64, %amt2;\n\t" + "shl.b64 \t%rhs, $src, %amt2;\n\t" + "add.u64 \t$dst, %lhs, %rhs;\n\t" + "}}", + [(set Int64Regs:$dst, (rotr Int64Regs:$src, (i32 Int32Regs:$amt)))]>; + +// +// Funnnel shift in clamp mode +// + +// Create SDNodes so they can be used in the DAG code, e.g. +// NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts) +def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>; +def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>; + +def FUNSHFLCLAMP : + NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), + "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;", + [(set Int32Regs:$dst, + (FUN_SHFL_CLAMP (i32 Int32Regs:$lo), (i32 Int32Regs:$hi), (i32 Int32Regs:$amt)))]>; + +def FUNSHFRCLAMP : + NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), + "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;", + [(set Int32Regs:$dst, + (FUN_SHFR_CLAMP (i32 Int32Regs:$lo), (i32 Int32Regs:$hi), (i32 Int32Regs:$amt)))]>; // // BFE - bit-field extract @@ -3496,42 +3657,6 @@ def : Pat<(v2i16 (build_vector (i16 Int16Regs:$a), (i16 Int16Regs:$b))), def: Pat<(v2i16 (scalar_to_vector (i16 Int16Regs:$a))), (CVT_u32_u16 Int16Regs:$a, CvtNONE)>; -// -// Funnel-Shift -// - -// Create SDNodes so they can be used in the DAG code, e.g. -// NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts) -def fshl_clamp : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>; -def fshr_clamp : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>; - -// Funnel shift, requires >= sm_32. Does not trap if amt is out of range, so -// no side effects. -let hasSideEffects = false in { - multiclass ShfInst<string mode, SDNode op> { - def _i - : NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt), - "shf." # mode # ".b32 \t$dst, $lo, $hi, $amt;", - [(set Int32Regs:$dst, - (op (i32 Int32Regs:$lo), (i32 Int32Regs:$hi), (i32 imm:$amt)))]>, - Requires<[hasHWROT32]>; - - def _r - : NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), - "shf." 
# mode # ".b32 \t$dst, $lo, $hi, $amt;", - [(set Int32Regs:$dst, - (op (i32 Int32Regs:$lo), (i32 Int32Regs:$hi), (i32 Int32Regs:$amt)))]>, - Requires<[hasHWROT32]>; - } - - defm SHF_L_CLAMP : ShfInst<"l.clamp", fshl_clamp>; - defm SHF_R_CLAMP : ShfInst<"r.clamp", fshr_clamp>; - defm SHF_L_WRAP : ShfInst<"l.wrap", fshl>; - defm SHF_R_WRAP : ShfInst<"r.wrap", fshr>; -} - // Count leading zeros let hasSideEffects = false in { def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a), diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 2688cfb..56c5516 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -2733,9 +2733,134 @@ def : Pat<(int_nvvm_read_ptx_sreg_envreg30), (MOV_SPECIAL ENVREG30)>; def : Pat<(int_nvvm_read_ptx_sreg_envreg31), (MOV_SPECIAL ENVREG31)>; +// rotate builtin support + +def ROTATE_B32_HW_IMM + : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$src, i32imm:$amt), + "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, + (int_nvvm_rotate_b32 Int32Regs:$src, (i32 imm:$amt)))]>, + Requires<[hasHWROT32]> ; + +def ROTATE_B32_HW_REG + : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$src, Int32Regs:$amt), + "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, + (int_nvvm_rotate_b32 Int32Regs:$src, Int32Regs:$amt))]>, + Requires<[hasHWROT32]> ; + +def : Pat<(int_nvvm_rotate_b32 Int32Regs:$src, (i32 imm:$amt)), + (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>, + Requires<[noHWROT32]> ; + +def : Pat<(int_nvvm_rotate_b32 Int32Regs:$src, Int32Regs:$amt), + (ROTL32reg_sw Int32Regs:$src, Int32Regs:$amt)>, + Requires<[noHWROT32]> ; + +let hasSideEffects = false in { + def GET_LO_INT64 : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src), + !strconcat("{{\n\t", + ".reg .b32 %dummy;\n\t", + "mov.b64 \t{$dst,%dummy}, $src;\n\t", + "}}"), + []> ; + + def GET_HI_INT64 : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src), + !strconcat("{{\n\t", + ".reg .b32 %dummy;\n\t", + "mov.b64 \t{%dummy,$dst}, $src;\n\t", + "}}"), + []> ; +} + +let hasSideEffects = false in { + def PACK_TWO_INT32 + : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$lo, Int32Regs:$hi), + "mov.b64 \t$dst, {{$lo, $hi}};", []> ; +} + def : Pat<(int_nvvm_swap_lo_hi_b64 Int64Regs:$src), - (V2I32toI64 (I64toI32H Int64Regs:$src), - (I64toI32L Int64Regs:$src))> ; + (PACK_TWO_INT32 (GET_HI_INT64 Int64Regs:$src), + (GET_LO_INT64 Int64Regs:$src))> ; + +// Funnel shift, requires >= sm_32. Does not trap if amt is out of range, so +// no side effects. 
+let hasSideEffects = false in { + def SHF_L_WRAP_B32_IMM + : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt), + "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>, + Requires<[hasHWROT32]>; + + def SHF_L_WRAP_B32_REG + : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), + "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>, + Requires<[hasHWROT32]>; + + def SHF_R_WRAP_B32_IMM + : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt), + "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>, + Requires<[hasHWROT32]>; + + def SHF_R_WRAP_B32_REG + : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), + "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>, + Requires<[hasHWROT32]>; +} + +// HW version of rotate 64 +def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, (i32 imm:$amt)), + (PACK_TWO_INT32 + (SHF_L_WRAP_B32_IMM (GET_HI_INT64 Int64Regs:$src), + (GET_LO_INT64 Int64Regs:$src), imm:$amt), + (SHF_L_WRAP_B32_IMM (GET_LO_INT64 Int64Regs:$src), + (GET_HI_INT64 Int64Regs:$src), imm:$amt))>, + Requires<[hasHWROT32]>; + +def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, Int32Regs:$amt), + (PACK_TWO_INT32 + (SHF_L_WRAP_B32_REG (GET_HI_INT64 Int64Regs:$src), + (GET_LO_INT64 Int64Regs:$src), Int32Regs:$amt), + (SHF_L_WRAP_B32_REG (GET_LO_INT64 Int64Regs:$src), + (GET_HI_INT64 Int64Regs:$src), Int32Regs:$amt))>, + Requires<[hasHWROT32]>; + + +def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, (i32 imm:$amt)), + (PACK_TWO_INT32 + (SHF_R_WRAP_B32_IMM (GET_LO_INT64 Int64Regs:$src), + (GET_HI_INT64 Int64Regs:$src), imm:$amt), + (SHF_R_WRAP_B32_IMM (GET_HI_INT64 Int64Regs:$src), + (GET_LO_INT64 Int64Regs:$src), imm:$amt))>, + Requires<[hasHWROT32]>; + +def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, Int32Regs:$amt), + (PACK_TWO_INT32 + (SHF_R_WRAP_B32_REG (GET_LO_INT64 Int64Regs:$src), + (GET_HI_INT64 Int64Regs:$src), Int32Regs:$amt), + (SHF_R_WRAP_B32_REG (GET_HI_INT64 Int64Regs:$src), + (GET_LO_INT64 Int64Regs:$src), Int32Regs:$amt))>, + Requires<[hasHWROT32]>; + +// SW version of rotate 64 +def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, (i32 imm:$amt)), + (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>, + Requires<[noHWROT32]>; +def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, Int32Regs:$amt), + (ROTL64reg_sw Int64Regs:$src, Int32Regs:$amt)>, + Requires<[noHWROT32]>; +def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, (i32 imm:$amt)), + (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>, + Requires<[noHWROT32]>; +def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, Int32Regs:$amt), + (ROTR64reg_sw Int64Regs:$src, Int32Regs:$amt)>, + Requires<[noHWROT32]>; + //----------------------------------- // Texture Intrinsics diff --git a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll index 43ac246..7e4a4d5 100644 --- a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll +++ b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll @@ -31,10 +31,6 @@ declare float @llvm.nvvm.bitcast.i2f(i32) declare i64 @llvm.nvvm.bitcast.d2ll(double) declare double @llvm.nvvm.bitcast.ll2d(i64) -declare i32 @llvm.nvvm.rotate.b32(i32, i32) -declare i64 @llvm.nvvm.rotate.right.b64(i64, i32) -declare i64 @llvm.nvvm.rotate.b64(i64, i32) - ; CHECK-LABEL: @simple_upgrade define void @simple_upgrade(i32 %a, i64 %b, i16 %c) { ; CHECK: call i32 @llvm.bitreverse.i32(i32 %a) @@ -143,16 +139,4 @@ define void @bitcast(i32 %a, i64 %b, float %c, double 
%d) { %r4 = call double @llvm.nvvm.bitcast.ll2d(i64 %b) ret void -} - -; CHECK-LABEL: @rotate -define void @rotate(i32 %a, i64 %b) { -; CHECK: call i32 @llvm.fshl.i32(i32 %a, i32 %a, i32 6) -; CHECK: call i64 @llvm.fshr.i64(i64 %b, i64 %b, i64 7) -; CHECK: call i64 @llvm.fshl.i64(i64 %b, i64 %b, i64 8) -; - %r1 = call i32 @llvm.nvvm.rotate.b32(i32 %a, i32 6) - %r2 = call i64 @llvm.nvvm.rotate.right.b64(i64 %b, i32 7) - %r3 = call i64 @llvm.nvvm.rotate.b64(i64 %b, i32 8) - ret void -} +}
\ No newline at end of file diff --git a/llvm/test/CodeGen/NVPTX/rotate.ll b/llvm/test/CodeGen/NVPTX/rotate.ll index 9ec5bcd..20c7ae5 100644 --- a/llvm/test/CodeGen/NVPTX/rotate.ll +++ b/llvm/test/CodeGen/NVPTX/rotate.ll @@ -9,29 +9,26 @@ declare i32 @llvm.nvvm.rotate.b32(i32, i32) declare i64 @llvm.nvvm.rotate.b64(i64, i32) declare i64 @llvm.nvvm.rotate.right.b64(i64, i32) -declare i64 @llvm.fshl.i64(i64, i64, i64) -declare i64 @llvm.fshr.i64(i64, i64, i64) -declare i32 @llvm.fshl.i32(i32, i32, i32) -declare i32 @llvm.fshr.i32(i32, i32, i32) - - ; SM20: rotate32 ; SM35: rotate32 define i32 @rotate32(i32 %a, i32 %b) { ; SM20-LABEL: rotate32( ; SM20: { -; SM20-NEXT: .reg .b32 %r<9>; +; SM20-NEXT: .reg .b32 %r<4>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: ; SM20-NEXT: ld.param.u32 %r1, [rotate32_param_0]; ; SM20-NEXT: ld.param.u32 %r2, [rotate32_param_1]; -; SM20-NEXT: and.b32 %r3, %r2, 31; -; SM20-NEXT: shl.b32 %r4, %r1, %r3; -; SM20-NEXT: neg.s32 %r5, %r2; -; SM20-NEXT: and.b32 %r6, %r5, 31; -; SM20-NEXT: shr.u32 %r7, %r1, %r6; -; SM20-NEXT: or.b32 %r8, %r4, %r7; -; SM20-NEXT: st.param.b32 [func_retval0+0], %r8; +; SM20-NEXT: { +; SM20-NEXT: .reg .b32 %lhs; +; SM20-NEXT: .reg .b32 %rhs; +; SM20-NEXT: .reg .b32 %amt2; +; SM20-NEXT: shl.b32 %lhs, %r1, %r2; +; SM20-NEXT: sub.s32 %amt2, 32, %r2; +; SM20-NEXT: shr.b32 %rhs, %r1, %amt2; +; SM20-NEXT: add.u32 %r3, %lhs, %rhs; +; SM20-NEXT: } +; SM20-NEXT: st.param.b32 [func_retval0+0], %r3; ; SM20-NEXT: ret; ; ; SM35-LABEL: rotate32( @@ -53,36 +50,45 @@ define i32 @rotate32(i32 %a, i32 %b) { define i64 @rotate64(i64 %a, i32 %b) { ; SM20-LABEL: rotate64( ; SM20: { -; SM20-NEXT: .reg .b32 %r<5>; -; SM20-NEXT: .reg .b64 %rd<5>; +; SM20-NEXT: .reg .b32 %r<2>; +; SM20-NEXT: .reg .b64 %rd<3>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: ; SM20-NEXT: ld.param.u64 %rd1, [rotate64_param_0]; ; SM20-NEXT: ld.param.u32 %r1, [rotate64_param_1]; -; SM20-NEXT: and.b32 %r2, %r1, 63; -; SM20-NEXT: shl.b64 %rd2, %rd1, %r2; -; SM20-NEXT: neg.s32 %r3, %r1; -; SM20-NEXT: and.b32 %r4, %r3, 63; -; SM20-NEXT: shr.u64 %rd3, %rd1, %r4; -; SM20-NEXT: or.b64 %rd4, %rd2, %rd3; -; SM20-NEXT: st.param.b64 [func_retval0+0], %rd4; +; SM20-NEXT: { +; SM20-NEXT: .reg .b64 %lhs; +; SM20-NEXT: .reg .b64 %rhs; +; SM20-NEXT: .reg .u32 %amt2; +; SM20-NEXT: and.b32 %amt2, %r1, 63; +; SM20-NEXT: shl.b64 %lhs, %rd1, %amt2; +; SM20-NEXT: sub.u32 %amt2, 64, %amt2; +; SM20-NEXT: shr.b64 %rhs, %rd1, %amt2; +; SM20-NEXT: add.u64 %rd2, %lhs, %rhs; +; SM20-NEXT: } +; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2; ; SM20-NEXT: ret; ; ; SM35-LABEL: rotate64( ; SM35: { -; SM35-NEXT: .reg .b32 %r<5>; -; SM35-NEXT: .reg .b64 %rd<5>; +; SM35-NEXT: .reg .b32 %r<6>; +; SM35-NEXT: .reg .b64 %rd<3>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: ; SM35-NEXT: ld.param.u64 %rd1, [rotate64_param_0]; -; SM35-NEXT: ld.param.u32 %r1, [rotate64_param_1]; -; SM35-NEXT: and.b32 %r2, %r1, 63; -; SM35-NEXT: shl.b64 %rd2, %rd1, %r2; -; SM35-NEXT: neg.s32 %r3, %r1; -; SM35-NEXT: and.b32 %r4, %r3, 63; -; SM35-NEXT: shr.u64 %rd3, %rd1, %r4; -; SM35-NEXT: or.b64 %rd4, %rd2, %rd3; -; SM35-NEXT: st.param.b64 [func_retval0+0], %rd4; +; SM35-NEXT: { +; SM35-NEXT: .reg .b32 %dummy; +; SM35-NEXT: mov.b64 {%dummy,%r1}, %rd1; +; SM35-NEXT: } +; SM35-NEXT: { +; SM35-NEXT: .reg .b32 %dummy; +; SM35-NEXT: mov.b64 {%r2,%dummy}, %rd1; +; SM35-NEXT: } +; SM35-NEXT: ld.param.u32 %r3, [rotate64_param_1]; +; SM35-NEXT: shf.l.wrap.b32 %r4, %r2, %r1, %r3; +; SM35-NEXT: shf.l.wrap.b32 %r5, %r1, %r2, %r3; +; SM35-NEXT: mov.b64 %rd2, {%r5, %r4}; +; SM35-NEXT: 
st.param.b64 [func_retval0+0], %rd2; ; SM35-NEXT: ret; %val = tail call i64 @llvm.nvvm.rotate.b64(i64 %a, i32 %b) ret i64 %val @@ -93,36 +99,45 @@ define i64 @rotate64(i64 %a, i32 %b) { define i64 @rotateright64(i64 %a, i32 %b) { ; SM20-LABEL: rotateright64( ; SM20: { -; SM20-NEXT: .reg .b32 %r<5>; -; SM20-NEXT: .reg .b64 %rd<5>; +; SM20-NEXT: .reg .b32 %r<2>; +; SM20-NEXT: .reg .b64 %rd<3>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: ; SM20-NEXT: ld.param.u64 %rd1, [rotateright64_param_0]; ; SM20-NEXT: ld.param.u32 %r1, [rotateright64_param_1]; -; SM20-NEXT: and.b32 %r2, %r1, 63; -; SM20-NEXT: shr.u64 %rd2, %rd1, %r2; -; SM20-NEXT: neg.s32 %r3, %r1; -; SM20-NEXT: and.b32 %r4, %r3, 63; -; SM20-NEXT: shl.b64 %rd3, %rd1, %r4; -; SM20-NEXT: or.b64 %rd4, %rd2, %rd3; -; SM20-NEXT: st.param.b64 [func_retval0+0], %rd4; +; SM20-NEXT: { +; SM20-NEXT: .reg .b64 %lhs; +; SM20-NEXT: .reg .b64 %rhs; +; SM20-NEXT: .reg .u32 %amt2; +; SM20-NEXT: and.b32 %amt2, %r1, 63; +; SM20-NEXT: shr.b64 %lhs, %rd1, %amt2; +; SM20-NEXT: sub.u32 %amt2, 64, %amt2; +; SM20-NEXT: shl.b64 %rhs, %rd1, %amt2; +; SM20-NEXT: add.u64 %rd2, %lhs, %rhs; +; SM20-NEXT: } +; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2; ; SM20-NEXT: ret; ; ; SM35-LABEL: rotateright64( ; SM35: { -; SM35-NEXT: .reg .b32 %r<5>; -; SM35-NEXT: .reg .b64 %rd<5>; +; SM35-NEXT: .reg .b32 %r<6>; +; SM35-NEXT: .reg .b64 %rd<3>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: ; SM35-NEXT: ld.param.u64 %rd1, [rotateright64_param_0]; -; SM35-NEXT: ld.param.u32 %r1, [rotateright64_param_1]; -; SM35-NEXT: and.b32 %r2, %r1, 63; -; SM35-NEXT: shr.u64 %rd2, %rd1, %r2; -; SM35-NEXT: neg.s32 %r3, %r1; -; SM35-NEXT: and.b32 %r4, %r3, 63; -; SM35-NEXT: shl.b64 %rd3, %rd1, %r4; -; SM35-NEXT: or.b64 %rd4, %rd2, %rd3; -; SM35-NEXT: st.param.b64 [func_retval0+0], %rd4; +; SM35-NEXT: { +; SM35-NEXT: .reg .b32 %dummy; +; SM35-NEXT: mov.b64 {%r1,%dummy}, %rd1; +; SM35-NEXT: } +; SM35-NEXT: { +; SM35-NEXT: .reg .b32 %dummy; +; SM35-NEXT: mov.b64 {%dummy,%r2}, %rd1; +; SM35-NEXT: } +; SM35-NEXT: ld.param.u32 %r3, [rotateright64_param_1]; +; SM35-NEXT: shf.r.wrap.b32 %r4, %r2, %r1, %r3; +; SM35-NEXT: shf.r.wrap.b32 %r5, %r1, %r2, %r3; +; SM35-NEXT: mov.b64 %rd2, {%r5, %r4}; +; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2; ; SM35-NEXT: ret; %val = tail call i64 @llvm.nvvm.rotate.right.b64(i64 %a, i32 %b) ret i64 %val @@ -133,14 +148,18 @@ define i64 @rotateright64(i64 %a, i32 %b) { define i32 @rotl0(i32 %x) { ; SM20-LABEL: rotl0( ; SM20: { -; SM20-NEXT: .reg .b32 %r<5>; +; SM20-NEXT: .reg .b32 %r<3>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: ; SM20-NEXT: ld.param.u32 %r1, [rotl0_param_0]; -; SM20-NEXT: shr.u32 %r2, %r1, 24; -; SM20-NEXT: shl.b32 %r3, %r1, 8; -; SM20-NEXT: or.b32 %r4, %r3, %r2; -; SM20-NEXT: st.param.b32 [func_retval0+0], %r4; +; SM20-NEXT: { +; SM20-NEXT: .reg .b32 %lhs; +; SM20-NEXT: .reg .b32 %rhs; +; SM20-NEXT: shl.b32 %lhs, %r1, 8; +; SM20-NEXT: shr.b32 %rhs, %r1, 24; +; SM20-NEXT: add.u32 %r2, %lhs, %rhs; +; SM20-NEXT: } +; SM20-NEXT: st.param.b32 [func_retval0+0], %r2; ; SM20-NEXT: ret; ; ; SM35-LABEL: rotl0( @@ -158,40 +177,51 @@ define i32 @rotl0(i32 %x) { ret i32 %t2 } +declare i64 @llvm.fshl.i64(i64, i64, i64) +declare i64 @llvm.fshr.i64(i64, i64, i64) + ; SM35: rotl64 define i64 @rotl64(i64 %a, i64 %n) { ; SM20-LABEL: rotl64( ; SM20: { -; SM20-NEXT: .reg .b32 %r<5>; -; SM20-NEXT: .reg .b64 %rd<5>; +; SM20-NEXT: .reg .b32 %r<2>; +; SM20-NEXT: .reg .b64 %rd<3>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: ; SM20-NEXT: ld.param.u64 %rd1, [rotl64_param_0]; ; SM20-NEXT: ld.param.u32 
%r1, [rotl64_param_1]; -; SM20-NEXT: and.b32 %r2, %r1, 63; -; SM20-NEXT: shl.b64 %rd2, %rd1, %r2; -; SM20-NEXT: neg.s32 %r3, %r1; -; SM20-NEXT: and.b32 %r4, %r3, 63; -; SM20-NEXT: shr.u64 %rd3, %rd1, %r4; -; SM20-NEXT: or.b64 %rd4, %rd2, %rd3; -; SM20-NEXT: st.param.b64 [func_retval0+0], %rd4; +; SM20-NEXT: { +; SM20-NEXT: .reg .b64 %lhs; +; SM20-NEXT: .reg .b64 %rhs; +; SM20-NEXT: .reg .u32 %amt2; +; SM20-NEXT: and.b32 %amt2, %r1, 63; +; SM20-NEXT: shl.b64 %lhs, %rd1, %amt2; +; SM20-NEXT: sub.u32 %amt2, 64, %amt2; +; SM20-NEXT: shr.b64 %rhs, %rd1, %amt2; +; SM20-NEXT: add.u64 %rd2, %lhs, %rhs; +; SM20-NEXT: } +; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2; ; SM20-NEXT: ret; ; ; SM35-LABEL: rotl64( ; SM35: { -; SM35-NEXT: .reg .b32 %r<5>; -; SM35-NEXT: .reg .b64 %rd<5>; +; SM35-NEXT: .reg .b32 %r<2>; +; SM35-NEXT: .reg .b64 %rd<3>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: ; SM35-NEXT: ld.param.u64 %rd1, [rotl64_param_0]; ; SM35-NEXT: ld.param.u32 %r1, [rotl64_param_1]; -; SM35-NEXT: and.b32 %r2, %r1, 63; -; SM35-NEXT: shl.b64 %rd2, %rd1, %r2; -; SM35-NEXT: neg.s32 %r3, %r1; -; SM35-NEXT: and.b32 %r4, %r3, 63; -; SM35-NEXT: shr.u64 %rd3, %rd1, %r4; -; SM35-NEXT: or.b64 %rd4, %rd2, %rd3; -; SM35-NEXT: st.param.b64 [func_retval0+0], %rd4; +; SM35-NEXT: { +; SM35-NEXT: .reg .b64 %lhs; +; SM35-NEXT: .reg .b64 %rhs; +; SM35-NEXT: .reg .u32 %amt2; +; SM35-NEXT: and.b32 %amt2, %r1, 63; +; SM35-NEXT: shl.b64 %lhs, %rd1, %amt2; +; SM35-NEXT: sub.u32 %amt2, 64, %amt2; +; SM35-NEXT: shr.b64 %rhs, %rd1, %amt2; +; SM35-NEXT: add.u64 %rd2, %lhs, %rhs; +; SM35-NEXT: } +; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2; ; SM35-NEXT: ret; %val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %n) ret i64 %val @@ -201,26 +231,34 @@ define i64 @rotl64(i64 %a, i64 %n) { define i64 @rotl64_imm(i64 %a) { ; SM20-LABEL: rotl64_imm( ; SM20: { -; SM20-NEXT: .reg .b64 %rd<5>; +; SM20-NEXT: .reg .b64 %rd<3>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: ; SM20-NEXT: ld.param.u64 %rd1, [rotl64_imm_param_0]; -; SM20-NEXT: shr.u64 %rd2, %rd1, 62; -; SM20-NEXT: shl.b64 %rd3, %rd1, 2; -; SM20-NEXT: or.b64 %rd4, %rd3, %rd2; -; SM20-NEXT: st.param.b64 [func_retval0+0], %rd4; +; SM20-NEXT: { +; SM20-NEXT: .reg .b64 %lhs; +; SM20-NEXT: .reg .b64 %rhs; +; SM20-NEXT: shl.b64 %lhs, %rd1, 2; +; SM20-NEXT: shr.b64 %rhs, %rd1, 62; +; SM20-NEXT: add.u64 %rd2, %lhs, %rhs; +; SM20-NEXT: } +; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2; ; SM20-NEXT: ret; ; ; SM35-LABEL: rotl64_imm( ; SM35: { -; SM35-NEXT: .reg .b64 %rd<5>; +; SM35-NEXT: .reg .b64 %rd<3>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: ; SM35-NEXT: ld.param.u64 %rd1, [rotl64_imm_param_0]; -; SM35-NEXT: shr.u64 %rd2, %rd1, 62; -; SM35-NEXT: shl.b64 %rd3, %rd1, 2; -; SM35-NEXT: or.b64 %rd4, %rd3, %rd2; -; SM35-NEXT: st.param.b64 [func_retval0+0], %rd4; +; SM35-NEXT: { +; SM35-NEXT: .reg .b64 %lhs; +; SM35-NEXT: .reg .b64 %rhs; +; SM35-NEXT: shl.b64 %lhs, %rd1, 2; +; SM35-NEXT: shr.b64 %rhs, %rd1, 62; +; SM35-NEXT: add.u64 %rd2, %lhs, %rhs; +; SM35-NEXT: } +; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2; ; SM35-NEXT: ret; %val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 66) ret i64 %val @@ -230,36 +268,44 @@ define i64 @rotl64_imm(i64 %a) { define i64 @rotr64(i64 %a, i64 %n) { ; SM20-LABEL: rotr64( ; SM20: { -; SM20-NEXT: .reg .b32 %r<5>; -; SM20-NEXT: .reg .b64 %rd<5>; +; SM20-NEXT: .reg .b32 %r<2>; +; SM20-NEXT: .reg .b64 %rd<3>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: ; SM20-NEXT: ld.param.u64 %rd1, [rotr64_param_0]; ; SM20-NEXT: ld.param.u32 %r1, [rotr64_param_1]; -; SM20-NEXT: 
and.b32 %r2, %r1, 63; -; SM20-NEXT: shr.u64 %rd2, %rd1, %r2; -; SM20-NEXT: neg.s32 %r3, %r1; -; SM20-NEXT: and.b32 %r4, %r3, 63; -; SM20-NEXT: shl.b64 %rd3, %rd1, %r4; -; SM20-NEXT: or.b64 %rd4, %rd2, %rd3; -; SM20-NEXT: st.param.b64 [func_retval0+0], %rd4; +; SM20-NEXT: { +; SM20-NEXT: .reg .b64 %lhs; +; SM20-NEXT: .reg .b64 %rhs; +; SM20-NEXT: .reg .u32 %amt2; +; SM20-NEXT: and.b32 %amt2, %r1, 63; +; SM20-NEXT: shr.b64 %lhs, %rd1, %amt2; +; SM20-NEXT: sub.u32 %amt2, 64, %amt2; +; SM20-NEXT: shl.b64 %rhs, %rd1, %amt2; +; SM20-NEXT: add.u64 %rd2, %lhs, %rhs; +; SM20-NEXT: } +; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2; ; SM20-NEXT: ret; ; ; SM35-LABEL: rotr64( ; SM35: { -; SM35-NEXT: .reg .b32 %r<5>; -; SM35-NEXT: .reg .b64 %rd<5>; +; SM35-NEXT: .reg .b32 %r<2>; +; SM35-NEXT: .reg .b64 %rd<3>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: ; SM35-NEXT: ld.param.u64 %rd1, [rotr64_param_0]; ; SM35-NEXT: ld.param.u32 %r1, [rotr64_param_1]; -; SM35-NEXT: and.b32 %r2, %r1, 63; -; SM35-NEXT: shr.u64 %rd2, %rd1, %r2; -; SM35-NEXT: neg.s32 %r3, %r1; -; SM35-NEXT: and.b32 %r4, %r3, 63; -; SM35-NEXT: shl.b64 %rd3, %rd1, %r4; -; SM35-NEXT: or.b64 %rd4, %rd2, %rd3; -; SM35-NEXT: st.param.b64 [func_retval0+0], %rd4; +; SM35-NEXT: { +; SM35-NEXT: .reg .b64 %lhs; +; SM35-NEXT: .reg .b64 %rhs; +; SM35-NEXT: .reg .u32 %amt2; +; SM35-NEXT: and.b32 %amt2, %r1, 63; +; SM35-NEXT: shr.b64 %lhs, %rd1, %amt2; +; SM35-NEXT: sub.u32 %amt2, 64, %amt2; +; SM35-NEXT: shl.b64 %rhs, %rd1, %amt2; +; SM35-NEXT: add.u64 %rd2, %lhs, %rhs; +; SM35-NEXT: } +; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2; ; SM35-NEXT: ret; %val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %n) ret i64 %val @@ -269,180 +315,35 @@ define i64 @rotr64(i64 %a, i64 %n) { define i64 @rotr64_imm(i64 %a) { ; SM20-LABEL: rotr64_imm( ; SM20: { -; SM20-NEXT: .reg .b64 %rd<5>; +; SM20-NEXT: .reg .b64 %rd<3>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: ; SM20-NEXT: ld.param.u64 %rd1, [rotr64_imm_param_0]; -; SM20-NEXT: shl.b64 %rd2, %rd1, 62; -; SM20-NEXT: shr.u64 %rd3, %rd1, 2; -; SM20-NEXT: or.b64 %rd4, %rd3, %rd2; -; SM20-NEXT: st.param.b64 [func_retval0+0], %rd4; +; SM20-NEXT: { +; SM20-NEXT: .reg .b64 %lhs; +; SM20-NEXT: .reg .b64 %rhs; +; SM20-NEXT: shl.b64 %lhs, %rd1, 62; +; SM20-NEXT: shr.b64 %rhs, %rd1, 2; +; SM20-NEXT: add.u64 %rd2, %lhs, %rhs; +; SM20-NEXT: } +; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2; ; SM20-NEXT: ret; ; ; SM35-LABEL: rotr64_imm( ; SM35: { -; SM35-NEXT: .reg .b64 %rd<5>; +; SM35-NEXT: .reg .b64 %rd<3>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: ; SM35-NEXT: ld.param.u64 %rd1, [rotr64_imm_param_0]; -; SM35-NEXT: shl.b64 %rd2, %rd1, 62; -; SM35-NEXT: shr.u64 %rd3, %rd1, 2; -; SM35-NEXT: or.b64 %rd4, %rd3, %rd2; -; SM35-NEXT: st.param.b64 [func_retval0+0], %rd4; +; SM35-NEXT: { +; SM35-NEXT: .reg .b64 %lhs; +; SM35-NEXT: .reg .b64 %rhs; +; SM35-NEXT: shl.b64 %lhs, %rd1, 62; +; SM35-NEXT: shr.b64 %rhs, %rd1, 2; +; SM35-NEXT: add.u64 %rd2, %lhs, %rhs; +; SM35-NEXT: } +; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2; ; SM35-NEXT: ret; %val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 66) ret i64 %val } - -define i32 @funnel_shift_right_32(i32 %a, i32 %b, i32 %c) { -; SM20-LABEL: funnel_shift_right_32( -; SM20: { -; SM20-NEXT: .reg .b32 %r<11>; -; SM20-EMPTY: -; SM20-NEXT: // %bb.0: -; SM20-NEXT: ld.param.u32 %r1, [funnel_shift_right_32_param_0]; -; SM20-NEXT: ld.param.u32 %r2, [funnel_shift_right_32_param_2]; -; SM20-NEXT: and.b32 %r3, %r2, 31; -; SM20-NEXT: ld.param.u32 %r4, [funnel_shift_right_32_param_1]; -; SM20-NEXT: shr.u32 
%r5, %r4, %r3; -; SM20-NEXT: shl.b32 %r6, %r1, 1; -; SM20-NEXT: not.b32 %r7, %r2; -; SM20-NEXT: and.b32 %r8, %r7, 31; -; SM20-NEXT: shl.b32 %r9, %r6, %r8; -; SM20-NEXT: or.b32 %r10, %r9, %r5; -; SM20-NEXT: st.param.b32 [func_retval0+0], %r10; -; SM20-NEXT: ret; -; -; SM35-LABEL: funnel_shift_right_32( -; SM35: { -; SM35-NEXT: .reg .b32 %r<5>; -; SM35-EMPTY: -; SM35-NEXT: // %bb.0: -; SM35-NEXT: ld.param.u32 %r1, [funnel_shift_right_32_param_0]; -; SM35-NEXT: ld.param.u32 %r2, [funnel_shift_right_32_param_1]; -; SM35-NEXT: ld.param.u32 %r3, [funnel_shift_right_32_param_2]; -; SM35-NEXT: shf.r.wrap.b32 %r4, %r1, %r2, %r3; -; SM35-NEXT: st.param.b32 [func_retval0+0], %r4; -; SM35-NEXT: ret; - %val = call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 %c) - ret i32 %val -} - -define i32 @funnel_shift_left_32(i32 %a, i32 %b, i32 %c) { -; SM20-LABEL: funnel_shift_left_32( -; SM20: { -; SM20-NEXT: .reg .b32 %r<11>; -; SM20-EMPTY: -; SM20-NEXT: // %bb.0: -; SM20-NEXT: ld.param.u32 %r1, [funnel_shift_left_32_param_0]; -; SM20-NEXT: ld.param.u32 %r2, [funnel_shift_left_32_param_2]; -; SM20-NEXT: and.b32 %r3, %r2, 31; -; SM20-NEXT: shl.b32 %r4, %r1, %r3; -; SM20-NEXT: ld.param.u32 %r5, [funnel_shift_left_32_param_1]; -; SM20-NEXT: shr.u32 %r6, %r5, 1; -; SM20-NEXT: not.b32 %r7, %r2; -; SM20-NEXT: and.b32 %r8, %r7, 31; -; SM20-NEXT: shr.u32 %r9, %r6, %r8; -; SM20-NEXT: or.b32 %r10, %r4, %r9; -; SM20-NEXT: st.param.b32 [func_retval0+0], %r10; -; SM20-NEXT: ret; -; -; SM35-LABEL: funnel_shift_left_32( -; SM35: { -; SM35-NEXT: .reg .b32 %r<5>; -; SM35-EMPTY: -; SM35-NEXT: // %bb.0: -; SM35-NEXT: ld.param.u32 %r1, [funnel_shift_left_32_param_0]; -; SM35-NEXT: ld.param.u32 %r2, [funnel_shift_left_32_param_1]; -; SM35-NEXT: ld.param.u32 %r3, [funnel_shift_left_32_param_2]; -; SM35-NEXT: shf.l.wrap.b32 %r4, %r1, %r2, %r3; -; SM35-NEXT: st.param.b32 [func_retval0+0], %r4; -; SM35-NEXT: ret; - %val = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c) - ret i32 %val -} - -define i64 @funnel_shift_right_64(i64 %a, i64 %b, i64 %c) { -; SM20-LABEL: funnel_shift_right_64( -; SM20: { -; SM20-NEXT: .reg .b32 %r<5>; -; SM20-NEXT: .reg .b64 %rd<7>; -; SM20-EMPTY: -; SM20-NEXT: // %bb.0: -; SM20-NEXT: ld.param.u64 %rd1, [funnel_shift_right_64_param_0]; -; SM20-NEXT: ld.param.u32 %r1, [funnel_shift_right_64_param_2]; -; SM20-NEXT: and.b32 %r2, %r1, 63; -; SM20-NEXT: ld.param.u64 %rd2, [funnel_shift_right_64_param_1]; -; SM20-NEXT: shr.u64 %rd3, %rd2, %r2; -; SM20-NEXT: shl.b64 %rd4, %rd1, 1; -; SM20-NEXT: not.b32 %r3, %r1; -; SM20-NEXT: and.b32 %r4, %r3, 63; -; SM20-NEXT: shl.b64 %rd5, %rd4, %r4; -; SM20-NEXT: or.b64 %rd6, %rd5, %rd3; -; SM20-NEXT: st.param.b64 [func_retval0+0], %rd6; -; SM20-NEXT: ret; -; -; SM35-LABEL: funnel_shift_right_64( -; SM35: { -; SM35-NEXT: .reg .b32 %r<5>; -; SM35-NEXT: .reg .b64 %rd<7>; -; SM35-EMPTY: -; SM35-NEXT: // %bb.0: -; SM35-NEXT: ld.param.u64 %rd1, [funnel_shift_right_64_param_0]; -; SM35-NEXT: ld.param.u32 %r1, [funnel_shift_right_64_param_2]; -; SM35-NEXT: and.b32 %r2, %r1, 63; -; SM35-NEXT: ld.param.u64 %rd2, [funnel_shift_right_64_param_1]; -; SM35-NEXT: shr.u64 %rd3, %rd2, %r2; -; SM35-NEXT: shl.b64 %rd4, %rd1, 1; -; SM35-NEXT: not.b32 %r3, %r1; -; SM35-NEXT: and.b32 %r4, %r3, 63; -; SM35-NEXT: shl.b64 %rd5, %rd4, %r4; -; SM35-NEXT: or.b64 %rd6, %rd5, %rd3; -; SM35-NEXT: st.param.b64 [func_retval0+0], %rd6; -; SM35-NEXT: ret; - %val = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 %c) - ret i64 %val -} - -define i64 @funnel_shift_left_64(i64 %a, i64 %b, i64 %c) { -; SM20-LABEL: 
funnel_shift_left_64( -; SM20: { -; SM20-NEXT: .reg .b32 %r<5>; -; SM20-NEXT: .reg .b64 %rd<7>; -; SM20-EMPTY: -; SM20-NEXT: // %bb.0: -; SM20-NEXT: ld.param.u64 %rd1, [funnel_shift_left_64_param_0]; -; SM20-NEXT: ld.param.u32 %r1, [funnel_shift_left_64_param_2]; -; SM20-NEXT: and.b32 %r2, %r1, 63; -; SM20-NEXT: shl.b64 %rd2, %rd1, %r2; -; SM20-NEXT: ld.param.u64 %rd3, [funnel_shift_left_64_param_1]; -; SM20-NEXT: shr.u64 %rd4, %rd3, 1; -; SM20-NEXT: not.b32 %r3, %r1; -; SM20-NEXT: and.b32 %r4, %r3, 63; -; SM20-NEXT: shr.u64 %rd5, %rd4, %r4; -; SM20-NEXT: or.b64 %rd6, %rd2, %rd5; -; SM20-NEXT: st.param.b64 [func_retval0+0], %rd6; -; SM20-NEXT: ret; -; -; SM35-LABEL: funnel_shift_left_64( -; SM35: { -; SM35-NEXT: .reg .b32 %r<5>; -; SM35-NEXT: .reg .b64 %rd<7>; -; SM35-EMPTY: -; SM35-NEXT: // %bb.0: -; SM35-NEXT: ld.param.u64 %rd1, [funnel_shift_left_64_param_0]; -; SM35-NEXT: ld.param.u32 %r1, [funnel_shift_left_64_param_2]; -; SM35-NEXT: and.b32 %r2, %r1, 63; -; SM35-NEXT: shl.b64 %rd2, %rd1, %r2; -; SM35-NEXT: ld.param.u64 %rd3, [funnel_shift_left_64_param_1]; -; SM35-NEXT: shr.u64 %rd4, %rd3, 1; -; SM35-NEXT: not.b32 %r3, %r1; -; SM35-NEXT: and.b32 %r4, %r3, 63; -; SM35-NEXT: shr.u64 %rd5, %rd4, %r4; -; SM35-NEXT: or.b64 %rd6, %rd2, %rd5; -; SM35-NEXT: st.param.b64 [func_retval0+0], %rd6; -; SM35-NEXT: ret; - %val = call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 %c) - ret i64 %val -} - diff --git a/llvm/test/CodeGen/NVPTX/rotate_64.ll b/llvm/test/CodeGen/NVPTX/rotate_64.ll index 05fdb02..64659ce 100644 --- a/llvm/test/CodeGen/NVPTX/rotate_64.ll +++ b/llvm/test/CodeGen/NVPTX/rotate_64.ll @@ -1,38 +1,25 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 | FileCheck %s ; RUN: %if ptxas %{ llc < %s -march=nvptx64 | %ptxas-verify %} declare i64 @llvm.nvvm.rotate.b64(i64, i32) declare i64 @llvm.nvvm.rotate.right.b64(i64, i32) +; CHECK: rotate64 define i64 @rotate64(i64 %a, i32 %b) { -; CHECK-LABEL: rotate64( -; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<5>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [rotate64_param_0]; -; CHECK-NEXT: shr.u64 %rd2, %rd1, 61; -; CHECK-NEXT: shl.b64 %rd3, %rd1, 3; -; CHECK-NEXT: or.b64 %rd4, %rd3, %rd2; -; CHECK-NEXT: st.param.b64 [func_retval0+0], %rd4; -; CHECK-NEXT: ret; +; CHECK: shl.b64 [[LHS:%.*]], [[RD1:%.*]], 3; +; CHECK: shr.b64 [[RHS:%.*]], [[RD1]], 61; +; CHECK: add.u64 [[RD2:%.*]], [[LHS]], [[RHS]]; +; CHECK: ret %val = tail call i64 @llvm.nvvm.rotate.b64(i64 %a, i32 3) ret i64 %val } +; CHECK: rotateright64 define i64 @rotateright64(i64 %a, i32 %b) { -; CHECK-LABEL: rotateright64( -; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<5>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [rotateright64_param_0]; -; CHECK-NEXT: shl.b64 %rd2, %rd1, 61; -; CHECK-NEXT: shr.u64 %rd3, %rd1, 3; -; CHECK-NEXT: or.b64 %rd4, %rd3, %rd2; -; CHECK-NEXT: st.param.b64 [func_retval0+0], %rd4; -; CHECK-NEXT: ret; +; CHECK: shl.b64 [[LHS:%.*]], [[RD1:%.*]], 61; +; CHECK: shr.b64 [[RHS:%.*]], [[RD1]], 3; +; CHECK: add.u64 [[RD2:%.*]], [[LHS]], [[RHS]]; +; CHECK: ret %val = tail call i64 @llvm.nvvm.rotate.right.b64(i64 %a, i32 3) ret i64 %val } |