author      Shilei Tian <i@tianshilei.me>   2025-06-23 13:06:28 -0400
committer   Shilei Tian <i@tianshilei.me>   2025-06-25 12:43:16 -0400
commit      d0efab140943c41b440b656008e10090f26fe5b9
tree        5be3920a585c5b21cada967d4b15e7cde9bb4daf
parent      5a194c1fd97b10fdbdbd8ada85372d978c9ff3c4
[WIP][AMDGPU][Attributor] Infer `inreg` attribute in `AMDGPUAttributor` (branch users/shiltian/aaamdgpuinreg)
 llvm/include/llvm/IR/Argument.h | 2
 llvm/lib/IR/Function.cpp | 4
 llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 197
 llvm/test/CodeGen/AMDGPU/aa-as-infer.ll | 16
 llvm/test/CodeGen/AMDGPU/aa-inreg-inference.ll | 296
 llvm/test/CodeGen/AMDGPU/amdgpu-attributor-accesslist-offsetbins-out-of-sync.ll | 5
 llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll | 15
7 files changed, 520 insertions, 15 deletions
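
What the patch does, in a nutshell: for a non-entry function whose argument is uniform at every call site, the attributor adds `inreg` to the argument and rewrites each call site to feed the value through `llvm.amdgcn.readfirstlane`, so the callee can expect the value in an SGPR. Below is a minimal before/after sketch distilled from the new aa-inreg-inference.ll test; the names @g, @callee, @kernel, and %x.sgpr are illustrative and not taken from the patch.

    ; Input: %x is a kernel argument, hence uniform at the call site.
    @g = addrspace(1) global i32 0

    define internal fastcc void @callee(i32 %y) {
      store i32 %y, ptr addrspace(1) @g, align 4
      ret void
    }

    define amdgpu_kernel void @kernel(i32 %x) {
      tail call fastcc void @callee(i32 %x)
      ret void
    }

    ; Output (sketch): the callee argument is inferred as inreg, and the call
    ; site routes the value through readfirstlane.
    define internal fastcc void @callee(i32 inreg %y) {
      store i32 %y, ptr addrspace(1) @g, align 4
      ret void
    }

    define amdgpu_kernel void @kernel(i32 %x) {
      %x.sgpr = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %x)
      tail call fastcc void @callee(i32 %x.sgpr)
      ret void
    }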
diff --git a/llvm/include/llvm/IR/Argument.h b/llvm/include/llvm/IR/Argument.h
index 60854b17..bbaea95 100644
--- a/llvm/include/llvm/IR/Argument.h
+++ b/llvm/include/llvm/IR/Argument.h
@@ -176,6 +176,8 @@ public:
 
   LLVM_ABI void removeAttrs(const AttributeMask &AM);
 
+  LLVM_ABI void removeAttr(StringRef Kind);
+
   /// Check if an argument has a given attribute.
   LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const;
 
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index 28fb810..49263db 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -323,6 +323,10 @@ void Argument::removeAttr(Attribute::AttrKind Kind) {
   getParent()->removeParamAttr(getArgNo(), Kind);
 }
 
+void Argument::removeAttr(StringRef Kind) {
+  getParent()->removeParamAttr(getArgNo(), Kind);
+}
+
 void Argument::removeAttrs(const AttributeMask &AM) {
   AttributeList AL = getParent()->getAttributes();
   AL = AL.removeParamAttributes(Parent->getContext(), getArgNo(), AM);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index fef22c8..a9640d4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -14,7 +14,10 @@
 #include "GCNSubtarget.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/Analysis/CycleAnalysis.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/UniformityAnalysis.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
 #include "llvm/InitializePasses.h"
@@ -1295,6 +1298,134 @@ struct AAAMDGPUNoAGPR
 
 const char AAAMDGPUNoAGPR::ID = 0;
 
+struct AAAMDGPUUniform : public StateWrapper<BooleanState, AbstractAttribute> {
+  using Base = StateWrapper<BooleanState, AbstractAttribute>;
+  AAAMDGPUUniform(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
+
+  /// Create an abstract attribute view for the position \p IRP.
+  static AAAMDGPUUniform &createForPosition(const IRPosition &IRP,
+                                            Attributor &A);
+
+  /// See AbstractAttribute::getName()
+  StringRef getName() const override { return "AAAMDGPUUniform"; }
+
+  const std::string getAsStr(Attributor *A) const override {
+    return getAssumed() ? "uniform" : "divergent";
+  }
+
+  void trackStatistics() const override {}
+
+  /// See AbstractAttribute::getIdAddr()
+  const char *getIdAddr() const override { return &ID; }
+
+  /// This function should return true if the type of the \p AA is
+  /// AAAMDGPUUniform
+  static bool classof(const AbstractAttribute *AA) {
+    return (AA->getIdAddr() == &ID);
+  }
+
+  /// Unique ID (due to the unique address)
+  static const char ID;
+};
+
+const char AAAMDGPUUniform::ID = 0;
+
+/// This AA infers the inreg attribute for a function argument.
+struct AAAMDGPUUniformArgument : public AAAMDGPUUniform {
+  AAAMDGPUUniformArgument(const IRPosition &IRP, Attributor &A)
+      : AAAMDGPUUniform(IRP, A) {}
+
+  void initialize(Attributor &A) override {
+    Argument *Arg = getAssociatedArgument();
+    CallingConv::ID CC = Arg->getParent()->getCallingConv();
+    if (Arg->hasAttribute(Attribute::InReg)) {
+      indicateOptimisticFixpoint();
+      return;
+    }
+    if (AMDGPU::isEntryFunctionCC(CC)) {
+      // We only use isArgPassedInSGPR on kernel entry function arguments, so
+      // even if we will use a VGPR for inreg i1 argument passing, it will not
+      // affect this.
+      if (AMDGPU::isArgPassedInSGPR(Arg))
+        indicateOptimisticFixpoint();
+      else
+        indicatePessimisticFixpoint();
+    }
+  }
+
+  ChangeStatus updateImpl(Attributor &A) override {
+    unsigned ArgNo = getAssociatedArgument()->getArgNo();
+
+    auto isUniform = [&](AbstractCallSite ACS) -> bool {
+      CallBase *CB = ACS.getInstruction();
+      Value *V = CB->getArgOperandUse(ArgNo);
+      if (isa<Constant>(V))
+        return true;
+      Function *F = nullptr;
+      if (auto *Arg = dyn_cast<Argument>(V)) {
+        auto *AA =
+            A.getOrCreateAAFor<AAAMDGPUUniform>(IRPosition::argument(*Arg));
+        if (AA)
+          return AA->isValidState();
+        F = Arg->getParent();
+      } else if (auto *I = dyn_cast<Instruction>(V)) {
+        F = I->getFunction();
+      }
+
+      if (F) {
+        auto *UA =
+            A.getInfoCache()
+                .getAnalysisResultForFunction<UniformityInfoAnalysis>(*F);
+        return UA && UA->isUniform(V);
+      }
+
+      return false;
+    };
+
+    bool UsedAssumedInformation = true;
+    if (!A.checkForAllCallSites(isUniform, *this, /*RequireAllCallSites=*/true,
+                                UsedAssumedInformation))
+      return indicatePessimisticFixpoint();
+
+    if (!UsedAssumedInformation)
+      return indicateOptimisticFixpoint();
+
+    return ChangeStatus::UNCHANGED;
+  }
+
+  ChangeStatus manifest(Attributor &A) override {
+    Argument *Arg = getAssociatedArgument();
+    // If the argument already has the inreg attribute, we will not do anything
+    // about it.
+    if (Arg->hasAttribute(Attribute::InReg))
+      return ChangeStatus::UNCHANGED;
+    if (AMDGPU::isEntryFunctionCC(Arg->getParent()->getCallingConv()))
+      return ChangeStatus::UNCHANGED;
+    // We don't directly emit readfirstlane here because it will cause multiple
+    // replacements of a single use in the manifest map, which is not supported
+    // at this moment.
+    // Add both the inreg and the "uniform" attribute to the argument. We will
+    // emit a readfirstlane at each call site for an inreg uniform argument, and
+    // the "uniform" attribute will be removed later.
+    LLVMContext &Ctx = Arg->getContext();
+    return A.manifestAttrs(getIRPosition(),
+                           {Attribute::get(Ctx, Attribute::InReg),
+                            Attribute::get(Ctx, "uniform")});
+  }
+};
+
+AAAMDGPUUniform &AAAMDGPUUniform::createForPosition(const IRPosition &IRP,
+                                                    Attributor &A) {
+  switch (IRP.getPositionKind()) {
+  case IRPosition::IRP_ARGUMENT:
+    return *new (A.Allocator) AAAMDGPUUniformArgument(IRP, A);
+  // TODO: Since inreg is also allowed for a return value, maybe we need to add
+  // AAAMDGPUUniformCallSiteReturned?
+  default:
+    llvm_unreachable("not a valid position for AAAMDGPUUniform");
+  }
+}
+
 /// Performs the final check and updates the 'amdgpu-waves-per-eu' attribute
 /// based on the finalized 'amdgpu-flat-work-group-size' attribute.
 /// Both attributes start with narrow ranges that expand during iteration.
@@ -1363,6 +1494,64 @@ static bool updateWavesPerEU(Module &M, TargetMachine &TM) {
   return Changed;
 }
 
+/// Emit the readfirstlane intrinsic for all inreg uniform function arguments at
+/// each call site. The inreg uniform attribute combination is set by
+/// AAAMDGPUUniform. This function provides a workaround for a downstream issue
+/// where failing to emit a waterfall loop for 'inreg' arguments may result in
+/// an invalid VGPR-to-SGPR copy. However, we intentionally avoid a waterfall
+/// loop for inreg uniform arguments here, because the 'inreg' attribute set by
+/// AAAMDGPUUniform guarantees uniformity, making the readfirstlane intrinsic
+/// appropriate.
+static bool emitReadFirstLaneForInregUniformArgs(Module &M) {
+  bool Changed = false;
+  std::vector<std::pair<CallBase *, unsigned>> WorkList;
+
+  for (Function &F : M) {
+    if (F.isDeclaration())
+      continue;
+    for (Argument &Arg : F.args()) {
+      if (!Arg.hasAttribute(Attribute::InReg) || !Arg.hasAttribute("uniform"))
+        continue;
+      unsigned ArgNo = Arg.getArgNo();
+      for (Use &U : F.uses()) {
+        auto *CB = dyn_cast<CallBase>(U.getUser());
+        if (!CB)
+          continue;
+        Value *CSArg = CB->getArgOperand(ArgNo);
+        // We don't need readfirstlane for a global value.
+        if (isa<GlobalValue>(CSArg))
+          continue;
+        // We skip the call site argument when it is itself an inreg argument.
+        // In this case, it will already be in an SGPR.
+        if (auto *CSArgArg = dyn_cast<Argument>(CSArg)) {
+          if (CSArgArg->hasAttribute(Attribute::InReg))
+            continue;
+        }
+        WorkList.emplace_back(CB, ArgNo);
+      }
+      Arg.removeAttr("uniform");
+      Changed = true;
+    }
+  }
+
+  if (WorkList.empty())
+    return Changed;
+
+  for (auto &[CB, ArgNo] : WorkList) {
+    Value *V = CB->getArgOperand(ArgNo);
+    IRBuilder<> Builder(CB);
+    Value *NewV = Builder.CreateIntrinsic(V->getType(),
+                                          Intrinsic::amdgcn_readfirstlane, {V});
+    CB->setArgOperand(ArgNo, NewV);
+    if (auto *I = dyn_cast<Instruction>(V)) {
+      if (I->use_empty())
+        I->eraseFromParent();
+    }
+  }
+
+  return true;
+}
+
 static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
                     AMDGPUAttributorOptions Options,
                     ThinOrFullLTOPhase LTOPhase) {
@@ -1381,7 +1570,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
       &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID,
       &AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
       &AAUnderlyingObjects::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID,
-      &AAInstanceInfo::ID});
+      &AAInstanceInfo::ID, &AAAMDGPUUniform::ID});
 
   AttributorConfig AC(CGUpdater);
   AC.IsClosedWorldModule = Options.IsClosedWorld;
@@ -1434,11 +1623,17 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
             IRPosition::value(*CmpX->getPointerOperand()));
       }
     }
+
+    if (!AMDGPU::isEntryFunctionCC(F->getCallingConv())) {
+      for (auto &Arg : F->args())
+        A.getOrCreateAAFor<AAAMDGPUUniform>(IRPosition::argument(Arg));
+    }
   }
 
   bool Changed = A.run() == ChangeStatus::CHANGED;
 
   Changed |= updateWavesPerEU(M, TM);
+  Changed |= emitReadFirstLaneForInregUniformArgs(M);
 
   return Changed;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/aa-as-infer.ll b/llvm/test/CodeGen/AMDGPU/aa-as-infer.ll
index 78766b4..8648e89 100644
--- a/llvm/test/CodeGen/AMDGPU/aa-as-infer.ll
+++ b/llvm/test/CodeGen/AMDGPU/aa-as-infer.ll
@@ -90,7 +90,7 @@ define void @call_volatile_load_store_as_4(ptr addrspace(4) %p1, ptr addrspace(4
 
 define internal void @can_infer_cmpxchg(ptr %word) {
 ; CHECK-LABEL: define internal void @can_infer_cmpxchg(
-; CHECK-SAME: ptr [[WORD:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr inreg [[WORD:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[WORD]] to ptr addrspace(1)
 ; CHECK-NEXT:    [[CMPXCHG_0:%.*]] = cmpxchg ptr addrspace(1) [[TMP1]], i32 0, i32 4 monotonic monotonic, align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = addrspacecast ptr [[WORD]] to ptr addrspace(1)
@@ -144,7 +144,7 @@ define internal void @can_not_infer_cmpxchg(ptr %word) {
 
 define internal void @can_infer_atomicrmw(ptr %word) {
 ; CHECK-LABEL: define internal void @can_infer_atomicrmw(
-; CHECK-SAME: ptr [[WORD:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr inreg [[WORD:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[WORD]] to ptr addrspace(1)
 ; CHECK-NEXT:
[[ATOMICRMW_XCHG:%.*]] = atomicrmw xchg ptr addrspace(1) [[TMP1]], i32 12 monotonic, align 4 ; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[WORD]] to ptr addrspace(1) @@ -215,13 +215,17 @@ define void @foo(ptr addrspace(3) %val) { ; CHECK-LABEL: define void @foo( ; CHECK-SAME: ptr addrspace(3) [[VAL:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: [[VAL_CAST:%.*]] = addrspacecast ptr addrspace(3) [[VAL]] to ptr -; CHECK-NEXT: call void @can_infer_cmpxchg(ptr addrspacecast (ptr addrspace(1) @g1 to ptr)) -; CHECK-NEXT: call void @can_infer_cmpxchg(ptr addrspacecast (ptr addrspace(1) @g2 to ptr)) +; CHECK-NEXT: [[TMP1:%.*]] = call ptr @llvm.amdgcn.readfirstlane.p0(ptr addrspacecast (ptr addrspace(1) @g1 to ptr)) +; CHECK-NEXT: call void @can_infer_cmpxchg(ptr [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = call ptr @llvm.amdgcn.readfirstlane.p0(ptr addrspacecast (ptr addrspace(1) @g2 to ptr)) +; CHECK-NEXT: call void @can_infer_cmpxchg(ptr [[TMP2]]) ; CHECK-NEXT: call void @can_not_infer_cmpxchg(ptr addrspacecast (ptr addrspace(1) @g1 to ptr)) ; CHECK-NEXT: call void @can_not_infer_cmpxchg(ptr addrspacecast (ptr addrspace(1) @g2 to ptr)) ; CHECK-NEXT: call void @can_not_infer_cmpxchg(ptr [[VAL_CAST]]) -; CHECK-NEXT: call void @can_infer_atomicrmw(ptr addrspacecast (ptr addrspace(1) @g1 to ptr)) -; CHECK-NEXT: call void @can_infer_atomicrmw(ptr addrspacecast (ptr addrspace(1) @g2 to ptr)) +; CHECK-NEXT: [[TMP3:%.*]] = call ptr @llvm.amdgcn.readfirstlane.p0(ptr addrspacecast (ptr addrspace(1) @g1 to ptr)) +; CHECK-NEXT: call void @can_infer_atomicrmw(ptr [[TMP3]]) +; CHECK-NEXT: [[TMP4:%.*]] = call ptr @llvm.amdgcn.readfirstlane.p0(ptr addrspacecast (ptr addrspace(1) @g2 to ptr)) +; CHECK-NEXT: call void @can_infer_atomicrmw(ptr [[TMP4]]) ; CHECK-NEXT: call void @can_not_infer_atomicrmw(ptr addrspacecast (ptr addrspace(1) @g1 to ptr)) ; CHECK-NEXT: call void @can_not_infer_atomicrmw(ptr addrspacecast (ptr addrspace(1) @g2 to ptr)) ; CHECK-NEXT: call void @can_not_infer_atomicrmw(ptr [[VAL_CAST]]) diff --git a/llvm/test/CodeGen/AMDGPU/aa-inreg-inference.ll b/llvm/test/CodeGen/AMDGPU/aa-inreg-inference.ll new file mode 100644 index 0000000..5af2b82 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/aa-inreg-inference.ll @@ -0,0 +1,296 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s -o - | FileCheck %s + +@g1 = protected addrspace(1) externally_initialized global i32 0, align 4 +@g2 = protected addrspace(1) externally_initialized global i32 0, align 4 +@g3 = protected addrspace(1) externally_initialized global i32 0, align 4 +@g4 = protected addrspace(1) externally_initialized global i32 0, align 4 + +;. +; CHECK: @g1 = protected addrspace(1) externally_initialized global i32 0, align 4 +; CHECK: @g2 = protected addrspace(1) externally_initialized global i32 0, align 4 +; CHECK: @g3 = protected addrspace(1) externally_initialized global i32 0, align 4 +; CHECK: @g4 = protected addrspace(1) externally_initialized global i32 0, align 4 +;. 
+define internal fastcc void @callee_infer(ptr addrspace(1) %x, i32 %y) { +; CHECK-LABEL: define {{[^@]+}}@callee_infer +; CHECK-SAME: (ptr addrspace(1) inreg [[X:%.*]], i32 inreg [[Y:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X_VAL:%.*]] = load i32, ptr addrspace(1) [[X]], align 4 +; CHECK-NEXT: store i32 [[X_VAL]], ptr addrspace(1) @g3, align 4 +; CHECK-NEXT: store i32 [[Y]], ptr addrspace(1) @g4, align 4 +; CHECK-NEXT: ret void +; +entry: + %x.val = load i32, ptr addrspace(1) %x, align 4 + store i32 %x.val, ptr addrspace(1) @g3, align 4 + store i32 %y, ptr addrspace(1) @g4, align 4 + ret void +} + +define amdgpu_kernel void @kernel_infer(ptr addrspace(1) %p1, ptr addrspace(1) %p2, i32 %x) { +; CHECK-LABEL: define {{[^@]+}}@kernel_infer +; CHECK-SAME: (ptr addrspace(1) [[P1:%.*]], ptr addrspace(1) [[P2:%.*]], i32 [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[X]], 0 +; CHECK-NEXT: [[P:%.*]] = select i1 [[CMP]], ptr addrspace(1) [[P1]], ptr addrspace(1) [[P2]] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[X]]) +; CHECK-NEXT: tail call fastcc void @callee_infer(ptr addrspace(1) @g1, i32 [[TMP0]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[X]]) +; CHECK-NEXT: tail call fastcc void @callee_infer(ptr addrspace(1) @g2, i32 [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 1) +; CHECK-NEXT: tail call fastcc void @callee_infer(ptr addrspace(1) @g1, i32 [[TMP2]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 2) +; CHECK-NEXT: tail call fastcc void @callee_infer(ptr addrspace(1) @g2, i32 [[TMP3]]) +; CHECK-NEXT: [[TMP4:%.*]] = call ptr addrspace(1) @llvm.amdgcn.readfirstlane.p1(ptr addrspace(1) [[P]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[X]]) +; CHECK-NEXT: tail call fastcc void @callee_infer(ptr addrspace(1) [[TMP4]], i32 [[TMP5]]) +; CHECK-NEXT: ret void +; +entry: + %cmp = icmp sgt i32 %x, 0 + %p = select i1 %cmp, ptr addrspace(1) %p1, ptr addrspace(1) %p2 + tail call fastcc void @callee_infer(ptr addrspace(1) @g1, i32 %x) + tail call fastcc void @callee_infer(ptr addrspace(1) @g2, i32 %x) + tail call fastcc void @callee_infer(ptr addrspace(1) @g1, i32 1) + tail call fastcc void @callee_infer(ptr addrspace(1) @g2, i32 2) + tail call fastcc void @callee_infer(ptr addrspace(1) %p, i32 %x) + ret void +} + +define amdgpu_kernel void @kernel_infer_indirect(ptr addrspace(1) %p1, ptr addrspace(1) %p2, i32 %x) { +; CHECK-LABEL: define {{[^@]+}}@kernel_infer_indirect +; CHECK-SAME: (ptr addrspace(1) [[P1:%.*]], ptr addrspace(1) [[P2:%.*]], i32 [[X:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[FN:%.*]] = alloca ptr, align 8, addrspace(5) +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[X]], 0 +; CHECK-NEXT: [[P:%.*]] = select i1 [[CMP]], ptr addrspace(1) [[P1]], ptr addrspace(1) [[P2]] +; CHECK-NEXT: store ptr @kernel_infer, ptr addrspace(5) [[FN]], align 8 +; CHECK-NEXT: [[FN_CAST:%.*]] = addrspacecast ptr addrspace(5) [[FN]] to ptr +; CHECK-NEXT: tail call fastcc void [[FN_CAST]](ptr addrspace(1) @g1, i32 [[X]]) +; CHECK-NEXT: tail call fastcc void [[FN_CAST]](ptr addrspace(1) @g2, i32 [[X]]) +; CHECK-NEXT: tail call fastcc void [[FN_CAST]](ptr addrspace(1) @g1, i32 1) +; CHECK-NEXT: tail call fastcc void [[FN_CAST]](ptr addrspace(1) @g2, i32 2) +; CHECK-NEXT: tail call fastcc void [[FN_CAST]](ptr addrspace(1) [[P]], i32 [[X]]) +; CHECK-NEXT: ret void +; +entry: 
+ %fn = alloca ptr, addrspace(5) + %cmp = icmp sgt i32 %x, 0 + %p = select i1 %cmp, ptr addrspace(1) %p1, ptr addrspace(1) %p2 + store ptr @kernel_infer, ptr addrspace(5) %fn + %fn.cast = addrspacecast ptr addrspace(5) %fn to ptr + tail call fastcc void %fn.cast(ptr addrspace(1) @g1, i32 %x) + tail call fastcc void %fn.cast(ptr addrspace(1) @g2, i32 %x) + tail call fastcc void %fn.cast(ptr addrspace(1) @g1, i32 1) + tail call fastcc void %fn.cast(ptr addrspace(1) @g2, i32 2) + tail call fastcc void %fn.cast(ptr addrspace(1) %p, i32 %x) + ret void +} + +define internal fastcc void @callee_not_infer(ptr addrspace(1) %x, i32 %y) { +; CHECK-LABEL: define {{[^@]+}}@callee_not_infer +; CHECK-SAME: (ptr addrspace(1) [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X_VAL:%.*]] = load i32, ptr addrspace(1) [[X]], align 4 +; CHECK-NEXT: store i32 [[X_VAL]], ptr addrspace(1) @g3, align 4 +; CHECK-NEXT: store i32 [[Y]], ptr addrspace(1) @g4, align 4 +; CHECK-NEXT: ret void +; +entry: + %x.val = load i32, ptr addrspace(1) %x, align 4 + store i32 %x.val, ptr addrspace(1) @g3, align 4 + store i32 %y, ptr addrspace(1) @g4, align 4 + ret void +} + +define amdgpu_kernel void @kernel_not_infer(ptr addrspace(1) %q, ptr addrspace(1) %p1, ptr addrspace(1) %p2) { +; CHECK-LABEL: define {{[^@]+}}@kernel_not_infer +; CHECK-SAME: (ptr addrspace(1) [[Q:%.*]], ptr addrspace(1) [[P1:%.*]], ptr addrspace(1) [[P2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[Q]], i32 [[ID_X]] +; CHECK-NEXT: [[D:%.*]] = load i32, ptr addrspace(1) [[GEP]], align 4 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[D]], 0 +; CHECK-NEXT: [[P:%.*]] = select i1 [[CMP]], ptr addrspace(1) [[P1]], ptr addrspace(1) [[P2]] +; CHECK-NEXT: tail call fastcc void @callee_not_infer(ptr addrspace(1) [[Q]], i32 [[ID_X]]) +; CHECK-NEXT: tail call fastcc void @callee_not_infer(ptr addrspace(1) [[P]], i32 [[ID_X]]) +; CHECK-NEXT: ret void +; +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i32, ptr addrspace(1) %q, i32 %id.x + %d = load i32, ptr addrspace(1) %gep + %cmp = icmp sgt i32 %d, 0 + %p = select i1 %cmp, ptr addrspace(1) %p1, ptr addrspace(1) %p2 + tail call fastcc void @callee_not_infer(ptr addrspace(1) %q, i32 %id.x) + tail call fastcc void @callee_not_infer(ptr addrspace(1) %p, i32 %id.x) + ret void +} + +define amdgpu_kernel void @kernel_not_infer_indirect(ptr addrspace(1) %q, ptr addrspace(1) %p1, ptr addrspace(1) %p2) { +; CHECK-LABEL: define {{[^@]+}}@kernel_not_infer_indirect +; CHECK-SAME: (ptr addrspace(1) [[Q:%.*]], ptr addrspace(1) [[P1:%.*]], ptr addrspace(1) [[P2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[FN:%.*]] = alloca ptr, align 8, addrspace(5) +; CHECK-NEXT: [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[Q]], i32 [[ID_X]] +; CHECK-NEXT: [[D:%.*]] = load i32, ptr addrspace(1) [[GEP]], align 4 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[D]], 0 +; CHECK-NEXT: [[P:%.*]] = select i1 [[CMP]], ptr addrspace(1) [[P1]], ptr addrspace(1) [[P2]] +; CHECK-NEXT: store ptr @kernel_not_infer, ptr addrspace(5) [[FN]], align 8 +; CHECK-NEXT: [[FN_CAST:%.*]] = addrspacecast ptr addrspace(5) [[FN]] to ptr +; CHECK-NEXT: tail call fastcc void [[FN_CAST]](ptr addrspace(1) [[Q]], i32 [[ID_X]]) +; CHECK-NEXT: tail call fastcc void [[FN_CAST]](ptr addrspace(1) [[P]], i32 [[ID_X]]) +; 
CHECK-NEXT: ret void +; +entry: + %fn = alloca ptr, addrspace(5) + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i32, ptr addrspace(1) %q, i32 %id.x + %d = load i32, ptr addrspace(1) %gep + %cmp = icmp sgt i32 %d, 0 + %p = select i1 %cmp, ptr addrspace(1) %p1, ptr addrspace(1) %p2 + store ptr @kernel_not_infer, ptr addrspace(5) %fn + %fn.cast = addrspacecast ptr addrspace(5) %fn to ptr + tail call fastcc void %fn.cast(ptr addrspace(1) %q, i32 %id.x) + tail call fastcc void %fn.cast(ptr addrspace(1) %p, i32 %id.x) + ret void +} + +define internal fastcc void @cs_callee_not_infer(ptr addrspace(1) %x, i32 %y) { +; CHECK-LABEL: define {{[^@]+}}@cs_callee_not_infer +; CHECK-SAME: (ptr addrspace(1) [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X_VAL:%.*]] = load i32, ptr addrspace(1) [[X]], align 4 +; CHECK-NEXT: store i32 [[X_VAL]], ptr addrspace(1) @g3, align 4 +; CHECK-NEXT: store i32 [[Y]], ptr addrspace(1) @g4, align 4 +; CHECK-NEXT: ret void +; +entry: + %x.val = load i32, ptr addrspace(1) %x, align 4 + store i32 %x.val, ptr addrspace(1) @g3, align 4 + store i32 %y, ptr addrspace(1) @g4, align 4 + ret void +} + +define amdgpu_cs void @cs_kernel_not_infer(ptr addrspace(1) %q, ptr addrspace(1) %p1, ptr addrspace(1) %p2) { +; CHECK-LABEL: define {{[^@]+}}@cs_kernel_not_infer +; CHECK-SAME: (ptr addrspace(1) [[Q:%.*]], ptr addrspace(1) [[P1:%.*]], ptr addrspace(1) [[P2:%.*]]) #[[ATTR2:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[Q]], i32 [[ID_X]] +; CHECK-NEXT: [[D:%.*]] = load i32, ptr addrspace(1) [[GEP]], align 4 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[D]], 0 +; CHECK-NEXT: [[P:%.*]] = select i1 [[CMP]], ptr addrspace(1) [[P1]], ptr addrspace(1) [[P2]] +; CHECK-NEXT: tail call fastcc void @cs_callee_not_infer(ptr addrspace(1) [[Q]], i32 [[ID_X]]) +; CHECK-NEXT: tail call fastcc void @cs_callee_not_infer(ptr addrspace(1) [[P]], i32 [[ID_X]]) +; CHECK-NEXT: ret void +; +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i32, ptr addrspace(1) %q, i32 %id.x + %d = load i32, ptr addrspace(1) %gep + %cmp = icmp sgt i32 %d, 0 + %p = select i1 %cmp, ptr addrspace(1) %p1, ptr addrspace(1) %p2 + tail call fastcc void @cs_callee_not_infer(ptr addrspace(1) %q, i32 %id.x) + tail call fastcc void @cs_callee_not_infer(ptr addrspace(1) %p, i32 %id.x) + ret void +} + +define internal fastcc void @cs_callee_not_infer_indirect(ptr addrspace(1) %x, i32 %y) { +; CHECK-LABEL: define {{[^@]+}}@cs_callee_not_infer_indirect +; CHECK-SAME: (ptr addrspace(1) [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X_VAL:%.*]] = load i32, ptr addrspace(1) [[X]], align 4 +; CHECK-NEXT: store i32 [[X_VAL]], ptr addrspace(1) @g3, align 4 +; CHECK-NEXT: store i32 [[Y]], ptr addrspace(1) @g4, align 4 +; CHECK-NEXT: ret void +; +entry: + %x.val = load i32, ptr addrspace(1) %x, align 4 + store i32 %x.val, ptr addrspace(1) @g3, align 4 + store i32 %y, ptr addrspace(1) @g4, align 4 + ret void +} + + +define amdgpu_cs void @cs_kernel_not_infer_indirect(ptr addrspace(1) %q, ptr addrspace(1) %p1, ptr addrspace(1) %p2) { +; CHECK-LABEL: define {{[^@]+}}@cs_kernel_not_infer_indirect +; CHECK-SAME: (ptr addrspace(1) [[Q:%.*]], ptr addrspace(1) [[P1:%.*]], ptr addrspace(1) [[P2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[FN:%.*]] = alloca ptr, align 8, addrspace(5) +; CHECK-NEXT: [[ID_X:%.*]] 
= call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[Q]], i32 [[ID_X]] +; CHECK-NEXT: [[D:%.*]] = load i32, ptr addrspace(1) [[GEP]], align 4 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[D]], 0 +; CHECK-NEXT: [[P:%.*]] = select i1 [[CMP]], ptr addrspace(1) [[P1]], ptr addrspace(1) [[P2]] +; CHECK-NEXT: store ptr @cs_callee_not_infer_indirect, ptr addrspace(5) [[FN]], align 8 +; CHECK-NEXT: [[FN_CAST:%.*]] = addrspacecast ptr addrspace(5) [[FN]] to ptr +; CHECK-NEXT: tail call fastcc void [[FN_CAST]](ptr addrspace(1) [[Q]], i32 [[ID_X]]) +; CHECK-NEXT: tail call fastcc void [[FN_CAST]](ptr addrspace(1) [[P]], i32 [[ID_X]]) +; CHECK-NEXT: ret void +; +entry: + %fn = alloca ptr, addrspace(5) + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i32, ptr addrspace(1) %q, i32 %id.x + %d = load i32, ptr addrspace(1) %gep + %cmp = icmp sgt i32 %d, 0 + %p = select i1 %cmp, ptr addrspace(1) %p1, ptr addrspace(1) %p2 + store ptr @cs_callee_not_infer_indirect, ptr addrspace(5) %fn + %fn.cast = addrspacecast ptr addrspace(5) %fn to ptr + tail call fastcc void %fn.cast(ptr addrspace(1) %q, i32 %id.x) + tail call fastcc void %fn.cast(ptr addrspace(1) %p, i32 %id.x) + ret void +} + +define internal fastcc void @callee_with_inreg(ptr addrspace(1) inreg %x, i32 inreg %y) { +; CHECK-LABEL: define {{[^@]+}}@callee_with_inreg +; CHECK-SAME: (ptr addrspace(1) inreg [[X:%.*]], i32 inreg [[Y:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X_VAL:%.*]] = load i32, ptr addrspace(1) [[X]], align 4 +; CHECK-NEXT: store i32 [[X_VAL]], ptr addrspace(1) @g3, align 4 +; CHECK-NEXT: store i32 [[Y]], ptr addrspace(1) @g4, align 4 +; CHECK-NEXT: ret void +; +entry: + %x.val = load i32, ptr addrspace(1) %x, align 4 + store i32 %x.val, ptr addrspace(1) @g3, align 4 + store i32 %y, ptr addrspace(1) @g4, align 4 + ret void +} + +define amdgpu_kernel void @kernel_without_readfirstlane(ptr addrspace(1) %p, i32 %x) { +; CHECK-LABEL: define {{[^@]+}}@kernel_without_readfirstlane +; CHECK-SAME: (ptr addrspace(1) [[P:%.*]], i32 [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: tail call fastcc void @callee_with_inreg(ptr addrspace(1) [[P]], i32 [[X]]) +; CHECK-NEXT: ret void +; +entry: + tail call fastcc void @callee_with_inreg(ptr addrspace(1) %p, i32 %x) + ret void +} + +define amdgpu_kernel void @kernel_with_readfirstlane(ptr addrspace(1) %p, i32 %x) { +; CHECK-LABEL: define {{[^@]+}}@kernel_with_readfirstlane +; CHECK-SAME: (ptr addrspace(1) [[P:%.*]], i32 [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P0:%.*]] = call ptr addrspace(1) @llvm.amdgcn.readfirstlane.p1(ptr addrspace(1) [[P]]) +; CHECK-NEXT: tail call fastcc void @callee_with_inreg(ptr addrspace(1) [[P0]], i32 [[X]]) +; CHECK-NEXT: ret void +; +entry: + %p0 = call ptr addrspace(1) @llvm.amdgcn.readfirstlane.p1(ptr addrspace(1) %p) + tail call fastcc void @callee_with_inreg(ptr addrspace(1) %p0, i32 %x) + ret void +} + +;. 
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nounwind willreturn memory(none) } +; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +;. diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-accesslist-offsetbins-out-of-sync.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-accesslist-offsetbins-out-of-sync.ll index d58a624..6a67c4e9 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-accesslist-offsetbins-out-of-sync.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-accesslist-offsetbins-out-of-sync.ll @@ -8,7 +8,7 @@ define internal fastcc void @foo(ptr %kg) { ; CHECK-LABEL: define internal fastcc void @foo( -; CHECK-SAME: ptr [[KG:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-SAME: ptr inreg [[KG:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[CLOSURE_I25_I:%.*]] = getelementptr i8, ptr [[KG]], i64 336 ; CHECK-NEXT: [[NUM_CLOSURE_I26_I:%.*]] = getelementptr i8, ptr [[KG]], i64 276 @@ -80,7 +80,8 @@ define amdgpu_kernel void @kernel() #0 { ; CHECK-NEXT: [[KGLOBALS_ASCAST1:%.*]] = addrspacecast ptr addrspace(5) [[SD]] to ptr ; CHECK-NEXT: [[NUM_CLOSURE_I_I:%.*]] = getelementptr i8, ptr addrspace(5) [[SD]], i32 276 ; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[NUM_CLOSURE_I_I]], align 4 -; CHECK-NEXT: call fastcc void @foo(ptr [[KGLOBALS_ASCAST1]]) +; CHECK-NEXT: [[TMP0:%.*]] = call ptr @llvm.amdgcn.readfirstlane.p0(ptr [[KGLOBALS_ASCAST1]]) +; CHECK-NEXT: call fastcc void @foo(ptr [[TMP0]]) ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll index ed4e691..f49ca5f 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll +++ b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll @@ -148,14 +148,15 @@ define amdgpu_kernel void @kernel_lds() { define internal i16 @mutual_recursion_0(i16 %arg) { ; CHECK-LABEL: define internal i16 @mutual_recursion_0( -; CHECK-SAME: i16 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-SAME: i16 inreg [[ARG:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() ; CHECK-NEXT: [[RECURSIVE_KERNEL_LDS:%.*]] = getelementptr inbounds [3 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[RECURSIVE_KERNEL_LDS]], align 4 ; CHECK-NEXT: [[RECURSIVE_KERNEL_LDS1:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3) ; CHECK-NEXT: [[LD:%.*]] = load i16, ptr addrspace(3) [[RECURSIVE_KERNEL_LDS1]], align 2 ; CHECK-NEXT: [[MUL:%.*]] = mul i16 [[LD]], 7 -; CHECK-NEXT: [[RET:%.*]] = call i16 @mutual_recursion_1(i16 [[LD]]) +; 
CHECK-NEXT: [[TMP3:%.*]] = call i16 @llvm.amdgcn.readfirstlane.i16(i16 [[LD]]) +; CHECK-NEXT: [[RET:%.*]] = call i16 @mutual_recursion_1(i16 [[TMP3]]) ; CHECK-NEXT: [[ADD:%.*]] = add i16 [[RET]], 1 ; CHECK-NEXT: ret i16 [[ADD]] ; @@ -168,7 +169,7 @@ define internal i16 @mutual_recursion_0(i16 %arg) { define internal void @mutual_recursion_1(i16 %arg) { ; CHECK-LABEL: define internal void @mutual_recursion_1( -; CHECK-SAME: i16 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-SAME: i16 inreg [[ARG:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: call void @mutual_recursion_0(i16 [[ARG]]) ; CHECK-NEXT: ret void ; @@ -180,7 +181,8 @@ define amdgpu_kernel void @kernel_lds_recursion() { ; CHECK-LABEL: define amdgpu_kernel void @kernel_lds_recursion( ; CHECK-SAME: ) #[[ATTR5:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META9:![0-9]+]] { ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kernel_lds_recursion.lds) ], !alias.scope [[META10:![0-9]+]], !noalias [[META13:![0-9]+]] -; CHECK-NEXT: call void @mutual_recursion_0(i16 0) +; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.amdgcn.readfirstlane.i16(i16 0) +; CHECK-NEXT: call void @mutual_recursion_0(i16 [[TMP1]]) ; CHECK-NEXT: ret void ; call void @mutual_recursion_0(i16 0) @@ -197,8 +199,9 @@ define amdgpu_kernel void @kernel_lds_recursion() { ; CHECK: attributes #[[ATTR3]] = { "amdgpu-lds-size"="4" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR4]] = { "amdgpu-agpr-alloc"="0" "amdgpu-lds-size"="2" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR5]] = { "amdgpu-agpr-alloc"="0" "amdgpu-lds-size"="4" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } -; CHECK: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR6:[0-9]+]] = { convergent nocallback nofree nounwind willreturn memory(none) } +; CHECK: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; CHECK: attributes #[[ATTR8:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;. ; CHECK: [[META0]] = !{i32 0, i32 1} ; CHECK: [[META1:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 400} |