Diffstat (limited to 'llvm/lib')
-rw-r--r--  llvm/lib/Analysis/DependenceAnalysis.cpp | 37
-rw-r--r--  llvm/lib/BinaryFormat/DXContainer.cpp | 11
-rw-r--r--  llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp | 9
-rw-r--r--  llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp | 11
-rw-r--r--  llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 409
-rw-r--r--  llvm/lib/MC/MCObjectStreamer.cpp | 2
-rw-r--r--  llvm/lib/MC/MCSection.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 37
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 18
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 67
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.h | 13
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 14
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 2
-rw-r--r--  llvm/lib/Target/ARM/ARMISelLowering.cpp | 140
-rw-r--r--  llvm/lib/Target/ARM/ARMISelLowering.h | 3
-rw-r--r--  llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp | 3
-rw-r--r--  llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp | 6
-rw-r--r--  llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp | 30
20 files changed, 777 insertions, 51 deletions
diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp
index 256befa..835e270 100644
--- a/llvm/lib/Analysis/DependenceAnalysis.cpp
+++ b/llvm/lib/Analysis/DependenceAnalysis.cpp
@@ -1074,7 +1074,7 @@ bool DependenceInfo::isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *X,
/// Compare to see if S is less than Size, using
///
-/// isKnownNegative(S - max(Size, 1))
+/// isKnownNegative(S - Size)
///
/// with some extra checking if S is an AddRec and we can prove less-than using
/// the loop bounds.
@@ -1090,21 +1090,34 @@ bool DependenceInfo::isKnownLessThan(const SCEV *S, const SCEV *Size) const {
Size = SE->getTruncateOrZeroExtend(Size, MaxType);
// Special check for addrecs using BE taken count
- const SCEV *Bound = SE->getMinusSCEV(S, Size);
- if (const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Bound)) {
- if (AddRec->isAffine()) {
+ if (const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(S))
+ if (AddRec->isAffine() && AddRec->hasNoSignedWrap()) {
const SCEV *BECount = SE->getBackedgeTakenCount(AddRec->getLoop());
- if (!isa<SCEVCouldNotCompute>(BECount)) {
- const SCEV *Limit = AddRec->evaluateAtIteration(BECount, *SE);
- if (SE->isKnownNegative(Limit))
- return true;
- }
+ const SCEV *Start = AddRec->getStart();
+ const SCEV *Step = AddRec->getStepRecurrence(*SE);
+ const SCEV *End = AddRec->evaluateAtIteration(BECount, *SE);
+ const SCEV *Diff0 = SE->getMinusSCEV(Start, Size);
+ const SCEV *Diff1 = SE->getMinusSCEV(End, Size);
+
+      // If the value of Step is non-negative and the AddRec does not wrap, it
+      // reaches its maximum at the last iteration, so it is enough to check
+      // whether End - Size is negative.
+ if (SE->isKnownNonNegative(Step) && SE->isKnownNegative(Diff1))
+ return true;
+
+      // If the value of Step is non-positive and the AddRec does not wrap,
+      // the initial value is its maximum.
+ if (SE->isKnownNonPositive(Step) && SE->isKnownNegative(Diff0))
+ return true;
+
+      // Even if we don't know the sign of Step, either Start or End must be
+      // the maximum value of the AddRec since it does not wrap.
+ if (SE->isKnownNegative(Diff0) && SE->isKnownNegative(Diff1))
+ return true;
}
- }
// Check using normal isKnownNegative
- const SCEV *LimitedBound =
- SE->getMinusSCEV(S, SE->getSMaxExpr(Size, SE->getOne(Size->getType())));
+ const SCEV *LimitedBound = SE->getMinusSCEV(S, Size);
return SE->isKnownNegative(LimitedBound);
}
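
The rewritten check relies on a non-wrapping affine AddRec being monotonic, so
its maximum is attained at either the first or the last iteration. A minimal
standalone sketch of that argument on plain integers (the helper name and
types are illustrative, not part of the patch):

  #include <cstdint>

  // V(i) = Start + i*Step for i = 0..N, assumed not to wrap. The sequence is
  // monotonic, so its maximum is one of the two endpoints, and
  // "every V(i) < Size" follows from checking Start and End alone.
  static bool allLessThan(int64_t Start, int64_t Step, int64_t N, int64_t Size) {
    int64_t End = Start + Step * N; // value at the last iteration
    return Start < Size && End < Size;
  }

The SCEV version keeps three separate checks because the sign of Step may be
provable even when only one of the two endpoint differences is.
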
diff --git a/llvm/lib/BinaryFormat/DXContainer.cpp b/llvm/lib/BinaryFormat/DXContainer.cpp
index 36d10d0..eb83945 100644
--- a/llvm/lib/BinaryFormat/DXContainer.cpp
+++ b/llvm/lib/BinaryFormat/DXContainer.cpp
@@ -60,6 +60,17 @@ ArrayRef<EnumEntry<SigComponentType>> dxbc::getSigComponentTypes() {
return ArrayRef(SigComponentTypes);
}
+static const EnumEntry<dxil::ResourceClass> ResourceClassNames[] = {
+ {"SRV", llvm::dxil::ResourceClass::SRV},
+ {"UAV", llvm::dxil::ResourceClass::UAV},
+ {"CBV", llvm::dxil::ResourceClass::CBuffer},
+ {"Sampler", llvm::dxil::ResourceClass::Sampler},
+};
+
+ArrayRef<EnumEntry<llvm::dxil::ResourceClass>> dxbc::getResourceClasses() {
+ return ArrayRef(ResourceClassNames);
+}
+
static const EnumEntry<RootFlags> RootFlagNames[] = {
#define ROOT_SIGNATURE_FLAG(Val, Enum) {#Enum, RootFlags::Enum},
#include "llvm/BinaryFormat/DXContainerConstants.def"
diff --git a/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp b/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp
index 79904fc..574883e 100644
--- a/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp
+++ b/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp
@@ -92,16 +92,9 @@ static raw_ostream &operator<<(raw_ostream &OS,
return OS;
}
-static const EnumEntry<dxil::ResourceClass> ResourceClassNames[] = {
- {"CBV", dxil::ResourceClass::CBuffer},
- {"SRV", dxil::ResourceClass::SRV},
- {"UAV", dxil::ResourceClass::UAV},
- {"Sampler", dxil::ResourceClass::Sampler},
-};
-
static raw_ostream &operator<<(raw_ostream &OS, const ClauseType &Type) {
OS << enumToStringRef(dxil::ResourceClass(llvm::to_underlying(Type)),
- ArrayRef(ResourceClassNames));
+ dxbc::getResourceClasses());
return OS;
}
diff --git a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp
index 9cf4ed1..1cda308 100644
--- a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp
+++ b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp
@@ -51,13 +51,6 @@ static std::optional<StringRef> extractMdStringValue(MDNode *Node,
return NodeText->getString();
}
-static const EnumEntry<dxil::ResourceClass> ResourceClassNames[] = {
- {"CBV", dxil::ResourceClass::CBuffer},
- {"SRV", dxil::ResourceClass::SRV},
- {"UAV", dxil::ResourceClass::UAV},
- {"Sampler", dxil::ResourceClass::Sampler},
-};
-
namespace {
// We use the OverloadVisit with std::visit to ensure the compiler catches if a
@@ -128,7 +121,7 @@ MDNode *MetadataBuilder::BuildRootDescriptor(const RootDescriptor &Descriptor) {
IRBuilder<> Builder(Ctx);
StringRef ResName =
enumToStringRef(dxil::ResourceClass(to_underlying(Descriptor.Type)),
- ArrayRef(ResourceClassNames));
+ dxbc::getResourceClasses());
assert(!ResName.empty() && "Provided an invalid Resource Class");
SmallString<7> Name({"Root", ResName});
Metadata *Operands[] = {
@@ -170,7 +163,7 @@ MDNode *MetadataBuilder::BuildDescriptorTableClause(
IRBuilder<> Builder(Ctx);
StringRef ResName =
enumToStringRef(dxil::ResourceClass(to_underlying(Clause.Type)),
- ArrayRef(ResourceClassNames));
+ dxbc::getResourceClasses());
assert(!ResName.empty() && "Provided an invalid Resource Class");
Metadata *Operands[] = {
MDString::get(Ctx, ResName),
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 260d3c2..ea027e4 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -4014,6 +4014,340 @@ OpenMPIRBuilder::createMasked(const LocationDescription &Loc,
/*Conditional*/ true, /*hasFinalize*/ true);
}
+static llvm::CallInst *emitNoUnwindRuntimeCall(IRBuilder<> &Builder,
+ llvm::FunctionCallee Callee,
+ ArrayRef<llvm::Value *> Args,
+ const llvm::Twine &Name) {
+ llvm::CallInst *Call = Builder.CreateCall(
+ Callee, Args, SmallVector<llvm::OperandBundleDef, 1>(), Name);
+ Call->setDoesNotThrow();
+ return Call;
+}
+
+// Expects the input basic block to be dominated by BeforeScanBB. Once the
+// scan directive is encountered, the code that follows it should be dominated
+// by AfterScanBB. The scan directive splits the code sequence into an input
+// phase and a scan phase. Depending on whether the inclusive or the exclusive
+// clause is used and on whether the input loop or the scan loop is being
+// lowered, it inserts jumps to the input and scan phases. The first scan loop
+// is the input loop and the second is the scan loop. The generated code
+// currently handles only inclusive scans.
+OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createScan(
+ const LocationDescription &Loc, InsertPointTy AllocaIP,
+ ArrayRef<llvm::Value *> ScanVars, ArrayRef<llvm::Type *> ScanVarsType,
+ bool IsInclusive, ScanInfo *ScanRedInfo) {
+ if (ScanRedInfo->OMPFirstScanLoop) {
+ llvm::Error Err = emitScanBasedDirectiveDeclsIR(AllocaIP, ScanVars,
+ ScanVarsType, ScanRedInfo);
+ if (Err)
+ return Err;
+ }
+ if (!updateToLocation(Loc))
+ return Loc.IP;
+
+ llvm::Value *IV = ScanRedInfo->IV;
+
+ if (ScanRedInfo->OMPFirstScanLoop) {
+ // Emit buffer[i] = red; at the end of the input phase.
+ for (size_t i = 0; i < ScanVars.size(); i++) {
+ Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
+ Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
+ Type *DestTy = ScanVarsType[i];
+ Value *Val = Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
+ Value *Src = Builder.CreateLoad(DestTy, ScanVars[i]);
+
+ Builder.CreateStore(Src, Val);
+ }
+ }
+ Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
+ emitBlock(ScanRedInfo->OMPScanDispatch,
+ Builder.GetInsertBlock()->getParent());
+
+ if (!ScanRedInfo->OMPFirstScanLoop) {
+ IV = ScanRedInfo->IV;
+ // Emit red = buffer[i]; at the entrance to the scan phase.
+ // TODO: if exclusive scan, the red = buffer[i-1] needs to be updated.
+ for (size_t i = 0; i < ScanVars.size(); i++) {
+ Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
+ Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
+ Type *DestTy = ScanVarsType[i];
+ Value *SrcPtr =
+ Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
+ Value *Src = Builder.CreateLoad(DestTy, SrcPtr);
+ Builder.CreateStore(Src, ScanVars[i]);
+ }
+ }
+
+ // TODO: Update it to CreateBr and remove dead blocks
+ llvm::Value *CmpI = Builder.getInt1(true);
+ if (ScanRedInfo->OMPFirstScanLoop == IsInclusive) {
+ Builder.CreateCondBr(CmpI, ScanRedInfo->OMPBeforeScanBlock,
+ ScanRedInfo->OMPAfterScanBlock);
+ } else {
+ Builder.CreateCondBr(CmpI, ScanRedInfo->OMPAfterScanBlock,
+ ScanRedInfo->OMPBeforeScanBlock);
+ }
+ emitBlock(ScanRedInfo->OMPAfterScanBlock,
+ Builder.GetInsertBlock()->getParent());
+ Builder.SetInsertPoint(ScanRedInfo->OMPAfterScanBlock);
+ return Builder.saveIP();
+}
+
+Error OpenMPIRBuilder::emitScanBasedDirectiveDeclsIR(
+ InsertPointTy AllocaIP, ArrayRef<Value *> ScanVars,
+ ArrayRef<Type *> ScanVarsType, ScanInfo *ScanRedInfo) {
+
+ Builder.restoreIP(AllocaIP);
+ // Create the shared pointer at alloca IP.
+ for (size_t i = 0; i < ScanVars.size(); i++) {
+ llvm::Value *BuffPtr =
+ Builder.CreateAlloca(Builder.getPtrTy(), nullptr, "vla");
+ (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]] = BuffPtr;
+ }
+
+ // Allocate temporary buffer by master thread
+ auto BodyGenCB = [&](InsertPointTy AllocaIP,
+ InsertPointTy CodeGenIP) -> Error {
+ Builder.restoreIP(CodeGenIP);
+ Value *AllocSpan =
+ Builder.CreateAdd(ScanRedInfo->Span, Builder.getInt32(1));
+ for (size_t i = 0; i < ScanVars.size(); i++) {
+ Type *IntPtrTy = Builder.getInt32Ty();
+ Constant *Allocsize = ConstantExpr::getSizeOf(ScanVarsType[i]);
+ Allocsize = ConstantExpr::getTruncOrBitCast(Allocsize, IntPtrTy);
+ Value *Buff = Builder.CreateMalloc(IntPtrTy, ScanVarsType[i], Allocsize,
+ AllocSpan, nullptr, "arr");
+ Builder.CreateStore(Buff, (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]]);
+ }
+ return Error::success();
+ };
+ // TODO: Perform finalization actions for variables. This has to be
+ // called for variables which have destructors/finalizers.
+ auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
+
+ Builder.SetInsertPoint(ScanRedInfo->OMPScanInit->getTerminator());
+ llvm::Value *FilterVal = Builder.getInt32(0);
+ llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
+ createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
+
+ if (!AfterIP)
+ return AfterIP.takeError();
+ Builder.restoreIP(*AfterIP);
+ BasicBlock *InputBB = Builder.GetInsertBlock();
+ if (InputBB->getTerminator())
+ Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
+ AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
+ if (!AfterIP)
+ return AfterIP.takeError();
+ Builder.restoreIP(*AfterIP);
+
+ return Error::success();
+}
+
+Error OpenMPIRBuilder::emitScanBasedDirectiveFinalsIR(
+ ArrayRef<ReductionInfo> ReductionInfos, ScanInfo *ScanRedInfo) {
+ auto BodyGenCB = [&](InsertPointTy AllocaIP,
+ InsertPointTy CodeGenIP) -> Error {
+ Builder.restoreIP(CodeGenIP);
+ for (ReductionInfo RedInfo : ReductionInfos) {
+ Value *PrivateVar = RedInfo.PrivateVariable;
+ Value *OrigVar = RedInfo.Variable;
+ Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[PrivateVar];
+ Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
+
+ Type *SrcTy = RedInfo.ElementType;
+ Value *Val = Builder.CreateInBoundsGEP(SrcTy, Buff, ScanRedInfo->Span,
+ "arrayOffset");
+ Value *Src = Builder.CreateLoad(SrcTy, Val);
+
+ Builder.CreateStore(Src, OrigVar);
+ Builder.CreateFree(Buff);
+ }
+ return Error::success();
+ };
+ // TODO: Perform finalization actions for variables. This has to be
+ // called for variables which have destructors/finalizers.
+ auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
+
+ if (ScanRedInfo->OMPScanFinish->getTerminator())
+ Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish->getTerminator());
+ else
+ Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish);
+
+ llvm::Value *FilterVal = Builder.getInt32(0);
+ llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
+ createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
+
+ if (!AfterIP)
+ return AfterIP.takeError();
+ Builder.restoreIP(*AfterIP);
+ BasicBlock *InputBB = Builder.GetInsertBlock();
+ if (InputBB->getTerminator())
+ Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
+ AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
+ if (!AfterIP)
+ return AfterIP.takeError();
+ Builder.restoreIP(*AfterIP);
+ return Error::success();
+}
+
+OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitScanReduction(
+ const LocationDescription &Loc,
+ ArrayRef<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos,
+ ScanInfo *ScanRedInfo) {
+
+ if (!updateToLocation(Loc))
+ return Loc.IP;
+ auto BodyGenCB = [&](InsertPointTy AllocaIP,
+ InsertPointTy CodeGenIP) -> Error {
+ Builder.restoreIP(CodeGenIP);
+ Function *CurFn = Builder.GetInsertBlock()->getParent();
+ // for (int k = 0; k <= ceil(log2(n)); ++k)
+ llvm::BasicBlock *LoopBB =
+ BasicBlock::Create(CurFn->getContext(), "omp.outer.log.scan.body");
+ llvm::BasicBlock *ExitBB =
+ splitBB(Builder, false, "omp.outer.log.scan.exit");
+ llvm::Function *F = llvm::Intrinsic::getOrInsertDeclaration(
+ Builder.GetInsertBlock()->getModule(),
+ (llvm::Intrinsic::ID)llvm::Intrinsic::log2, Builder.getDoubleTy());
+ llvm::BasicBlock *InputBB = Builder.GetInsertBlock();
+ llvm::Value *Arg =
+ Builder.CreateUIToFP(ScanRedInfo->Span, Builder.getDoubleTy());
+ llvm::Value *LogVal = emitNoUnwindRuntimeCall(Builder, F, Arg, "");
+ F = llvm::Intrinsic::getOrInsertDeclaration(
+ Builder.GetInsertBlock()->getModule(),
+ (llvm::Intrinsic::ID)llvm::Intrinsic::ceil, Builder.getDoubleTy());
+ LogVal = emitNoUnwindRuntimeCall(Builder, F, LogVal, "");
+ LogVal = Builder.CreateFPToUI(LogVal, Builder.getInt32Ty());
+ llvm::Value *NMin1 = Builder.CreateNUWSub(
+ ScanRedInfo->Span,
+ llvm::ConstantInt::get(ScanRedInfo->Span->getType(), 1));
+ Builder.SetInsertPoint(InputBB);
+ Builder.CreateBr(LoopBB);
+ emitBlock(LoopBB, CurFn);
+ Builder.SetInsertPoint(LoopBB);
+
+ PHINode *Counter = Builder.CreatePHI(Builder.getInt32Ty(), 2);
+ // size pow2k = 1;
+ PHINode *Pow2K = Builder.CreatePHI(Builder.getInt32Ty(), 2);
+ Counter->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 0),
+ InputBB);
+ Pow2K->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 1),
+ InputBB);
+ // for (size i = n - 1; i >= 2 ^ k; --i)
+ // tmp[i] op= tmp[i-pow2k];
+ llvm::BasicBlock *InnerLoopBB =
+ BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.body");
+ llvm::BasicBlock *InnerExitBB =
+ BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.exit");
+ llvm::Value *CmpI = Builder.CreateICmpUGE(NMin1, Pow2K);
+ Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
+ emitBlock(InnerLoopBB, CurFn);
+ Builder.SetInsertPoint(InnerLoopBB);
+ PHINode *IVal = Builder.CreatePHI(Builder.getInt32Ty(), 2);
+ IVal->addIncoming(NMin1, LoopBB);
+ for (ReductionInfo RedInfo : ReductionInfos) {
+ Value *ReductionVal = RedInfo.PrivateVariable;
+ Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ReductionVal];
+ Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
+ Type *DestTy = RedInfo.ElementType;
+ Value *IV = Builder.CreateAdd(IVal, Builder.getInt32(1));
+ Value *LHSPtr =
+ Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
+ Value *OffsetIval = Builder.CreateNUWSub(IV, Pow2K);
+ Value *RHSPtr =
+ Builder.CreateInBoundsGEP(DestTy, Buff, OffsetIval, "arrayOffset");
+ Value *LHS = Builder.CreateLoad(DestTy, LHSPtr);
+ Value *RHS = Builder.CreateLoad(DestTy, RHSPtr);
+ llvm::Value *Result;
+ InsertPointOrErrorTy AfterIP =
+ RedInfo.ReductionGen(Builder.saveIP(), LHS, RHS, Result);
+ if (!AfterIP)
+ return AfterIP.takeError();
+ Builder.CreateStore(Result, LHSPtr);
+ }
+ llvm::Value *NextIVal = Builder.CreateNUWSub(
+ IVal, llvm::ConstantInt::get(Builder.getInt32Ty(), 1));
+ IVal->addIncoming(NextIVal, Builder.GetInsertBlock());
+ CmpI = Builder.CreateICmpUGE(NextIVal, Pow2K);
+ Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
+ emitBlock(InnerExitBB, CurFn);
+ llvm::Value *Next = Builder.CreateNUWAdd(
+ Counter, llvm::ConstantInt::get(Counter->getType(), 1));
+ Counter->addIncoming(Next, Builder.GetInsertBlock());
+ // pow2k <<= 1;
+ llvm::Value *NextPow2K = Builder.CreateShl(Pow2K, 1, "", /*HasNUW=*/true);
+ Pow2K->addIncoming(NextPow2K, Builder.GetInsertBlock());
+ llvm::Value *Cmp = Builder.CreateICmpNE(Next, LogVal);
+ Builder.CreateCondBr(Cmp, LoopBB, ExitBB);
+ Builder.SetInsertPoint(ExitBB->getFirstInsertionPt());
+ return Error::success();
+ };
+
+ // TODO: Perform finalization actions for variables. This has to be
+ // called for variables which have destructors/finalizers.
+ auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
+
+ llvm::Value *FilterVal = Builder.getInt32(0);
+ llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
+ createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
+
+ if (!AfterIP)
+ return AfterIP.takeError();
+ Builder.restoreIP(*AfterIP);
+ AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
+
+ if (!AfterIP)
+ return AfterIP.takeError();
+ Builder.restoreIP(*AfterIP);
+ Error Err = emitScanBasedDirectiveFinalsIR(ReductionInfos, ScanRedInfo);
+ if (Err)
+ return Err;
+
+ return AfterIP;
+}
+
+Error OpenMPIRBuilder::emitScanBasedDirectiveIR(
+ llvm::function_ref<Error()> InputLoopGen,
+ llvm::function_ref<Error(LocationDescription Loc)> ScanLoopGen,
+ ScanInfo *ScanRedInfo) {
+
+ {
+ // Emit loop with input phase:
+ // for (i: 0..<num_iters>) {
+ // <input phase>;
+ // buffer[i] = red;
+ // }
+ ScanRedInfo->OMPFirstScanLoop = true;
+ Error Err = InputLoopGen();
+ if (Err)
+ return Err;
+ }
+ {
+ // Emit loop with scan phase:
+ // for (i: 0..<num_iters>) {
+ // red = buffer[i];
+ // <scan phase>;
+ // }
+ ScanRedInfo->OMPFirstScanLoop = false;
+ Error Err = ScanLoopGen(Builder.saveIP());
+ if (Err)
+ return Err;
+ }
+ return Error::success();
+}
+
+void OpenMPIRBuilder::createScanBBs(ScanInfo *ScanRedInfo) {
+ Function *Fun = Builder.GetInsertBlock()->getParent();
+ ScanRedInfo->OMPScanDispatch =
+ BasicBlock::Create(Fun->getContext(), "omp.inscan.dispatch");
+ ScanRedInfo->OMPAfterScanBlock =
+ BasicBlock::Create(Fun->getContext(), "omp.after.scan.bb");
+ ScanRedInfo->OMPBeforeScanBlock =
+ BasicBlock::Create(Fun->getContext(), "omp.before.scan.bb");
+ ScanRedInfo->OMPScanLoopExit =
+ BasicBlock::Create(Fun->getContext(), "omp.scan.loop.exit");
+}
CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton(
DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
BasicBlock *PostInsertBefore, const Twine &Name) {
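
For reference, the IR that emitScanReduction builds above corresponds to a
logarithmic-depth in-place inclusive scan over the shared buffer. A sequential
C++ sketch of the same loop nest (illustrative only; the element type and the
+= stand in for whatever reduction operation ReductionGen emits):

  #include <cmath>
  #include <cstddef>
  #include <vector>

  static void inclusiveScanInPlace(std::vector<double> &Tmp) {
    std::size_t N = Tmp.size();
    if (N < 2)
      return;
    std::size_t Rounds =
        static_cast<std::size_t>(std::ceil(std::log2(static_cast<double>(N))));
    std::size_t Pow2K = 1;
    for (std::size_t K = 0; K < Rounds; ++K) {
      // Walking downwards lets the update be done in place.
      for (std::size_t I = N - 1; I >= Pow2K; --I)
        Tmp[I] += Tmp[I - Pow2K];
      Pow2K <<= 1;
    }
  }
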
@@ -4111,6 +4445,76 @@ OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc,
return CL;
}
+Expected<ScanInfo *> OpenMPIRBuilder::scanInfoInitialize() {
+ ScanInfos.emplace_front();
+ ScanInfo *Result = &ScanInfos.front();
+ return Result;
+}
+
+Expected<SmallVector<llvm::CanonicalLoopInfo *>>
+OpenMPIRBuilder::createCanonicalScanLoops(
+ const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
+ Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
+ InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo) {
+ LocationDescription ComputeLoc =
+ ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
+ updateToLocation(ComputeLoc);
+
+ SmallVector<CanonicalLoopInfo *> Result;
+
+ Value *TripCount = calculateCanonicalLoopTripCount(
+ ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
+ ScanRedInfo->Span = TripCount;
+ ScanRedInfo->OMPScanInit = splitBB(Builder, true, "scan.init");
+ Builder.SetInsertPoint(ScanRedInfo->OMPScanInit);
+
+ auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
+ Builder.restoreIP(CodeGenIP);
+ ScanRedInfo->IV = IV;
+ createScanBBs(ScanRedInfo);
+ BasicBlock *InputBlock = Builder.GetInsertBlock();
+ Instruction *Terminator = InputBlock->getTerminator();
+ assert(Terminator->getNumSuccessors() == 1);
+ BasicBlock *ContinueBlock = Terminator->getSuccessor(0);
+ Terminator->setSuccessor(0, ScanRedInfo->OMPScanDispatch);
+ emitBlock(ScanRedInfo->OMPBeforeScanBlock,
+ Builder.GetInsertBlock()->getParent());
+ Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
+ emitBlock(ScanRedInfo->OMPScanLoopExit,
+ Builder.GetInsertBlock()->getParent());
+ Builder.CreateBr(ContinueBlock);
+ Builder.SetInsertPoint(
+ ScanRedInfo->OMPBeforeScanBlock->getFirstInsertionPt());
+ return BodyGenCB(Builder.saveIP(), IV);
+ };
+
+ const auto &&InputLoopGen = [&]() -> Error {
+ Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
+ Builder.saveIP(), BodyGen, Start, Stop, Step, IsSigned, InclusiveStop,
+ ComputeIP, Name, true, ScanRedInfo);
+ if (!LoopInfo)
+ return LoopInfo.takeError();
+ Result.push_back(*LoopInfo);
+ Builder.restoreIP((*LoopInfo)->getAfterIP());
+ return Error::success();
+ };
+ const auto &&ScanLoopGen = [&](LocationDescription Loc) -> Error {
+ Expected<CanonicalLoopInfo *> LoopInfo =
+ createCanonicalLoop(Loc, BodyGen, Start, Stop, Step, IsSigned,
+ InclusiveStop, ComputeIP, Name, true, ScanRedInfo);
+ if (!LoopInfo)
+ return LoopInfo.takeError();
+ Result.push_back(*LoopInfo);
+ Builder.restoreIP((*LoopInfo)->getAfterIP());
+ ScanRedInfo->OMPScanFinish = Builder.GetInsertBlock();
+ return Error::success();
+ };
+ Error Err = emitScanBasedDirectiveIR(InputLoopGen, ScanLoopGen, ScanRedInfo);
+ if (Err)
+ return Err;
+ return Result;
+}
+
Value *OpenMPIRBuilder::calculateCanonicalLoopTripCount(
const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step,
bool IsSigned, bool InclusiveStop, const Twine &Name) {
@@ -4174,7 +4578,8 @@ Value *OpenMPIRBuilder::calculateCanonicalLoopTripCount(
Expected<CanonicalLoopInfo *> OpenMPIRBuilder::createCanonicalLoop(
const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
- InsertPointTy ComputeIP, const Twine &Name) {
+ InsertPointTy ComputeIP, const Twine &Name, bool InScan,
+ ScanInfo *ScanRedInfo) {
LocationDescription ComputeLoc =
ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
@@ -4185,6 +4590,8 @@ Expected<CanonicalLoopInfo *> OpenMPIRBuilder::createCanonicalLoop(
Builder.restoreIP(CodeGenIP);
Value *Span = Builder.CreateMul(IV, Step);
Value *IndVar = Builder.CreateAdd(Span, Start);
+ if (InScan)
+ ScanRedInfo->IV = IndVar;
return BodyGenCB(Builder.saveIP(), IndVar);
};
LocationDescription LoopLoc =
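
createCanonicalScanLoops and emitScanBasedDirectiveIR realize the two-pass
lowering: the loop body is emitted once as the input phase and once as the
scan phase. A minimal source-level example of the OpenMP inclusive-scan
construct this targets (hypothetical user code; N, In and Out are assumed to
exist):

  double Sum = 0.0;
  #pragma omp parallel for reduction(inscan, + : Sum)
  for (int I = 0; I < N; ++I) {
    Sum += In[I];                   // input phase (first generated loop)
  #pragma omp scan inclusive(Sum)
    Out[I] = Sum;                   // scan phase (second generated loop)
  }
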
diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp
index 8c27958..d0c6144 100644
--- a/llvm/lib/MC/MCObjectStreamer.cpp
+++ b/llvm/lib/MC/MCObjectStreamer.cpp
@@ -443,7 +443,7 @@ void MCObjectStreamer::emitInstToData(const MCInst &Inst,
// MCAssembler::relaxAlign.
auto *Sec = F->getParent();
if (!Sec->isLinkerRelaxable())
- Sec->setLinkerRelaxable();
+ Sec->setFirstLinkerRelaxable(F->getLayoutOrder());
// Do not add data after a linker-relaxable instruction. The difference
// between a new label and a label at or before the linker-relaxable
// instruction cannot be resolved at assemble-time.
diff --git a/llvm/lib/MC/MCSection.cpp b/llvm/lib/MC/MCSection.cpp
index 27ca131..9ed6fd1 100644
--- a/llvm/lib/MC/MCSection.cpp
+++ b/llvm/lib/MC/MCSection.cpp
@@ -20,7 +20,7 @@ using namespace llvm;
MCSection::MCSection(StringRef Name, bool IsText, bool IsBss, MCSymbol *Begin)
: Begin(Begin), HasInstructions(false), IsRegistered(false), IsText(IsText),
- IsBss(IsBss), LinkerRelaxable(false), Name(Name) {
+ IsBss(IsBss), Name(Name) {
DummyFragment.setParent(this);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index a6e4a63..40d960e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5879,8 +5879,12 @@ AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
const LLT S32 = LLT::scalar(32);
MachineRegisterInfo &MRI = *B.getMRI();
- std::tie(BaseReg, ImmOffset) =
- AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
+ // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
+ // being added, so we can only safely match a 32-bit addition with no unsigned
+ // overflow.
+ bool CheckNUW = AMDGPU::isGFX1250(ST);
+ std::tie(BaseReg, ImmOffset) = AMDGPU::getBaseWithConstantOffset(
+ MRI, OrigOffset, /*KnownBits=*/nullptr, CheckNUW);
// If BaseReg is a pointer, convert it to int.
if (MRI.getType(BaseReg).isPointer())
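
The NUW restriction exists because GFX1250 zero-extends voffset and immoffset
to 64 bits before adding them, so splitting Off = Base + Imm is only sound
when the 32-bit addition cannot wrap. A small sketch of the failure case on
plain integers (names are illustrative):

  #include <cstdint>

  static void wrapExample() {
    uint32_t Base = 0xFFFFFFF0u, Imm = 0x20u;
    uint32_t Off = Base + Imm;                        // wraps to 0x10
    uint64_t HwAddr = uint64_t(Base) + uint64_t(Imm); // 0x1'0000'0010
    (void)Off;
    (void)HwAddr; // zext(Base) + zext(Imm) != zext(Base + Imm) once it wraps
  }
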
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index ff8efd2..0d2feeb 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -4933,6 +4933,43 @@ bool AMDGPUAsmParser::validateOpSel(const MCInst &Inst) {
return false;
}
+ // Packed math FP32 instructions typically accept SGPRs or VGPRs as source
+ // operands. On gfx12+, if a source operand uses SGPRs, the HW can only read
+ // the first SGPR and use it for both the low and high operations.
+ if (isPackedFP32Inst(Opc) && isGFX12Plus()) {
+ int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+ int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
+ int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel);
+ int OpSelHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel_hi);
+
+ const MCOperand &Src0 = Inst.getOperand(Src0Idx);
+ const MCOperand &Src1 = Inst.getOperand(Src1Idx);
+ unsigned OpSel = Inst.getOperand(OpSelIdx).getImm();
+ unsigned OpSelHi = Inst.getOperand(OpSelHiIdx).getImm();
+
+ const MCRegisterInfo *TRI = getContext().getRegisterInfo();
+
+ auto VerifyOneSGPR = [OpSel, OpSelHi](unsigned Index) -> bool {
+ unsigned Mask = 1U << Index;
+ return ((OpSel & Mask) == 0) && ((OpSelHi & Mask) == 0);
+ };
+
+ if (Src0.isReg() && isSGPR(Src0.getReg(), TRI) &&
+ !VerifyOneSGPR(/*Index=*/0))
+ return false;
+ if (Src1.isReg() && isSGPR(Src1.getReg(), TRI) &&
+ !VerifyOneSGPR(/*Index=*/1))
+ return false;
+
+ int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
+ if (Src2Idx != -1) {
+ const MCOperand &Src2 = Inst.getOperand(Src2Idx);
+ if (Src2.isReg() && isSGPR(Src2.getReg(), TRI) &&
+ !VerifyOneSGPR(/*Index=*/2))
+ return false;
+ }
+ }
+
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 8f44c03..5b327fb 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6106,6 +6106,7 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
case MVT::f64:
return true;
case MVT::f16:
+ case MVT::bf16:
return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
default:
break;
@@ -10877,6 +10878,13 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
}
}
+// Return whether the operation has NoUnsignedWrap property.
+static bool isNoUnsignedWrap(SDValue Addr) {
+ return (Addr.getOpcode() == ISD::ADD &&
+ Addr->getFlags().hasNoUnsignedWrap()) ||
+ Addr->getOpcode() == ISD::OR;
+}
+
bool SITargetLowering::shouldPreservePtrArith(const Function &F,
EVT PtrVT) const {
return UseSelectionDAGPTRADD && PtrVT == MVT::i64;
@@ -10898,8 +10906,14 @@ SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
if ((C1 = dyn_cast<ConstantSDNode>(N0)))
N0 = SDValue();
else if (DAG.isBaseWithConstantOffset(N0)) {
- C1 = cast<ConstantSDNode>(N0.getOperand(1));
- N0 = N0.getOperand(0);
+ // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
+ // being added, so we can only safely match a 32-bit addition with no
+ // unsigned overflow.
+ bool CheckNUW = AMDGPU::isGFX1250(*Subtarget);
+ if (!CheckNUW || isNoUnsignedWrap(N0)) {
+ C1 = cast<ConstantSDNode>(N0.getOperand(1));
+ N0 = N0.getOperand(0);
+ }
}
if (C1) {
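
isNoUnsignedWrap above also accepts ISD::OR because the DAG only reports a
base-with-constant-offset OR when the two operands have no set bits in common;
under that assumption the OR is an addition that cannot carry. A tiny
illustration (helper name is made up):

  #include <cassert>
  #include <cstdint>

  static uint32_t orAsAdd(uint32_t Base, uint32_t Imm) {
    assert((Base & Imm) == 0 && "operands must have disjoint bits");
    return Base | Imm; // equals Base + Imm, with no possible unsigned wrap
  }
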
diff --git a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
index d8fe850..0a68512 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
@@ -51,7 +51,7 @@ static cl::opt<unsigned>
namespace {
enum HardClauseType {
- // For GFX10:
+ // For GFX10 and GFX1250:
// Texture, buffer, global or scratch memory instructions.
HARDCLAUSE_VMEM,
@@ -102,7 +102,8 @@ public:
HardClauseType getHardClauseType(const MachineInstr &MI) {
if (MI.mayLoad() || (MI.mayStore() && ST->shouldClusterStores())) {
- if (ST->getGeneration() == AMDGPUSubtarget::GFX10) {
+ if (ST->getGeneration() == AMDGPUSubtarget::GFX10 ||
+ ST->hasGFX1250Insts()) {
if ((SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isFLAT(MI)) ||
SIInstrInfo::isSegmentSpecificFLAT(MI)) {
if (ST->hasNSAClauseBug()) {
@@ -115,7 +116,6 @@ public:
if (SIInstrInfo::isFLAT(MI))
return HARDCLAUSE_FLAT;
} else {
- assert(ST->getGeneration() >= AMDGPUSubtarget::GFX11);
if (SIInstrInfo::isMIMG(MI)) {
const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index f20b22d..19e6bcf 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -18,6 +18,7 @@
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/LiveIntervals.h"
@@ -5534,6 +5535,15 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
+ // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
+ // information.
+ if (AMDGPU::isPackedFP32Inst(Opcode) && AMDGPU::isGFX12Plus(ST)) {
+ for (unsigned I = 0; I < 3; ++I) {
+ if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I))
+ return false;
+ }
+ }
+
return true;
}
@@ -6005,6 +6015,21 @@ bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
unsigned Opc = MI.getOpcode();
+ // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
+ // information.
+ if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) &&
+ MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
+ constexpr const AMDGPU::OpName OpNames[] = {
+ AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
+
+ for (auto [I, OpName] : enumerate(OpNames)) {
+ int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[I]);
+ if (static_cast<unsigned>(SrcIdx) == OpIdx &&
+ !isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I, &MO))
+ return false;
+ }
+ }
+
if (!isLegalRegOperand(MRI, OpInfo, MO))
return false;
@@ -6053,6 +6078,39 @@ bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
return true;
}
+bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand(
+ const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
+ const MachineOperand *MO) const {
+ constexpr const unsigned NumOps = 3;
+ constexpr const AMDGPU::OpName OpNames[NumOps * 2] = {
+ AMDGPU::OpName::src0, AMDGPU::OpName::src1,
+ AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
+ AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
+
+ assert(SrcN < NumOps);
+
+ if (!MO) {
+ int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[SrcN]);
+ if (SrcIdx == -1)
+ return true;
+ MO = &MI.getOperand(SrcIdx);
+ }
+
+ if (!MO->isReg() || !RI.isSGPRReg(MRI, MO->getReg()))
+ return true;
+
+ int ModsIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[NumOps + SrcN]);
+ if (ModsIdx == -1)
+ return true;
+
+ unsigned Mods = MI.getOperand(ModsIdx).getImm();
+ bool OpSel = Mods & SISrcMods::OP_SEL_0;
+ bool OpSelHi = Mods & SISrcMods::OP_SEL_1;
+
+ return !OpSel && !OpSelHi;
+}
+
bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
const MachineOperand *MO) const {
const MachineFunction &MF = *MI.getParent()->getParent();
@@ -6390,6 +6448,15 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
!RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
legalizeOpWithMove(MI, VOP3Idx[2]);
+
+ // Fix the register class of packed FP32 instructions on gfx12+. See
+ // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
+ if (AMDGPU::isPackedFP32Inst(Opc) && AMDGPU::isGFX12Plus(ST)) {
+ for (unsigned I = 0; I < 3; ++I) {
+ if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, /*SrcN=*/I))
+ legalizeOpWithMove(MI, VOP3Idx[I]);
+ }
+ }
}
Register SIInstrInfo::readlaneVGPRToSGPR(
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index e042b59..6b9403f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1287,6 +1287,19 @@ public:
const MachineOperand &MO) const;
bool isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
const MachineOperand &MO) const;
+
+ /// Check if \p MO would be a legal operand for gfx12+ packed math FP32
+ /// instructions. Packed math FP32 instructions typically accept SGPRs or
+ /// VGPRs as source operands. On gfx12+, if a source operand uses SGPRs, the
+ /// HW can only read the first SGPR and use it for both the low and high
+ /// operations.
+ /// \p SrcN can be 0, 1, or 2, representing src0, src1, and src2,
+ /// respectively. If \p MO is nullptr, the operand corresponding to SrcN will
+ /// be used.
+ bool isLegalGFX12PlusPackedMathFP32Operand(
+ const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
+ const MachineOperand *MO = nullptr) const;
+
/// Legalize operands in \p MI by either commuting it or inserting a
/// copy of src1.
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 00dcb9b..1e3e9a2 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -3318,6 +3318,20 @@ unsigned getLdsDwGranularity(const MCSubtargetInfo &ST) {
return 128;
}
+bool isPackedFP32Inst(unsigned Opc) {
+ switch (Opc) {
+ case AMDGPU::V_PK_ADD_F32:
+ case AMDGPU::V_PK_ADD_F32_gfx12:
+ case AMDGPU::V_PK_MUL_F32:
+ case AMDGPU::V_PK_MUL_F32_gfx12:
+ case AMDGPU::V_PK_FMA_F32:
+ case AMDGPU::V_PK_FMA_F32_gfx12:
+ return true;
+ default:
+ return false;
+ }
+}
+
} // namespace AMDGPU
raw_ostream &operator<<(raw_ostream &OS,
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 1252e35..1bcd36c 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1709,6 +1709,8 @@ bool isArgPassedInSGPR(const Argument *Arg);
bool isArgPassedInSGPR(const CallBase *CB, unsigned ArgNo);
+LLVM_READONLY bool isPackedFP32Inst(unsigned Opc);
+
LLVM_READONLY
bool isLegalSMRDEncodedUnsignedOffset(const MCSubtargetInfo &ST,
int64_t EncodedOffset);
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index ea99cc4..75d3cfa 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -802,6 +802,12 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
setOperationAction(ISD::BSWAP, VT, Expand);
}
+ if (!Subtarget->isThumb1Only() && !Subtarget->hasV8_1MMainlineOps())
+ setOperationAction(ISD::SCMP, MVT::i32, Custom);
+
+ if (!Subtarget->hasV8_1MMainlineOps())
+ setOperationAction(ISD::UCMP, MVT::i32, Custom);
+
setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
@@ -1634,6 +1640,10 @@ bool ARMTargetLowering::useSoftFloat() const {
return Subtarget->useSoftFloat();
}
+bool ARMTargetLowering::shouldExpandCmpUsingSelects(EVT VT) const {
+ return !Subtarget->isThumb1Only() && VT.getSizeInBits() <= 32;
+}
+
// FIXME: It might make sense to define the representative register class as the
// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
// a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently,
@@ -10612,6 +10622,133 @@ SDValue ARMTargetLowering::LowerFP_TO_BF16(SDValue Op,
return DAG.getBitcast(MVT::i32, Res);
}
+SDValue ARMTargetLowering::LowerCMP(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+
+ // Determine if this is signed or unsigned comparison
+ bool IsSigned = (Op.getOpcode() == ISD::SCMP);
+
+ // Special case for Thumb1 UCMP only
+ if (!IsSigned && Subtarget->isThumb1Only()) {
+ // For Thumb unsigned comparison, use this sequence:
+ // subs r2, r0, r1 ; r2 = LHS - RHS, sets flags
+ // sbc r2, r2 ; r2 = r2 - r2 - !carry
+ // cmp r1, r0 ; compare RHS with LHS
+ // sbc r1, r1 ; r1 = r1 - r1 - !carry
+ // subs r0, r2, r1 ; r0 = r2 - r1 (final result)
+
+ // First subtraction: LHS - RHS
+ SDValue Sub1WithFlags = DAG.getNode(
+ ARMISD::SUBC, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
+ SDValue Sub1Result = Sub1WithFlags.getValue(0);
+ SDValue Flags1 = Sub1WithFlags.getValue(1);
+
+ // SUBE: Sub1Result - Sub1Result - !carry
+ // This gives 0 if LHS >= RHS (unsigned), -1 if LHS < RHS (unsigned)
+ SDValue Sbc1 =
+ DAG.getNode(ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT),
+ Sub1Result, Sub1Result, Flags1);
+ SDValue Sbc1Result = Sbc1.getValue(0);
+
+ // Second comparison: RHS vs LHS (reverse comparison)
+ SDValue CmpFlags = DAG.getNode(ARMISD::CMP, dl, FlagsVT, RHS, LHS);
+
+ // SUBE: RHS - RHS - !carry
+    // This gives 0 if RHS >= LHS (unsigned), -1 if RHS < LHS (unsigned)
+ SDValue Sbc2 = DAG.getNode(
+ ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT), RHS, RHS, CmpFlags);
+ SDValue Sbc2Result = Sbc2.getValue(0);
+
+ // Final subtraction: Sbc1Result - Sbc2Result (no flags needed)
+ SDValue Result =
+ DAG.getNode(ISD::SUB, dl, MVT::i32, Sbc1Result, Sbc2Result);
+ if (Op.getValueType() != MVT::i32)
+ Result = DAG.getSExtOrTrunc(Result, dl, Op.getValueType());
+
+ return Result;
+ }
+
+ // For the ARM assembly pattern:
+  //   subs  r0, r0, r1 ; subtract RHS from LHS and set flags
+  //   movgt r0, #1     ; if LHS > RHS, result = 1  (GT signed / HI unsigned)
+  //   mvnlt r0, #0     ; if LHS < RHS, result = -1 (LT signed / LO unsigned)
+  //                    ; if LHS == RHS, result stays 0 from the subs
+
+ // Optimization: if RHS is a subtraction against 0, use ADDC instead of SUBC
+ unsigned Opcode = ARMISD::SUBC;
+
+ // Check if RHS is a subtraction against 0: (0 - X)
+ if (RHS.getOpcode() == ISD::SUB) {
+ SDValue SubLHS = RHS.getOperand(0);
+ SDValue SubRHS = RHS.getOperand(1);
+
+ // Check if it's 0 - X
+ if (isNullConstant(SubLHS)) {
+ bool CanUseAdd = false;
+ if (IsSigned) {
+ // For SCMP: only if X is known to never be INT_MIN (to avoid overflow)
+ if (RHS->getFlags().hasNoSignedWrap() || !DAG.computeKnownBits(SubRHS)
+ .getSignedMinValue()
+ .isMinSignedValue()) {
+ CanUseAdd = true;
+ }
+ } else {
+ // For UCMP: only if X is known to never be zero
+ if (DAG.isKnownNeverZero(SubRHS)) {
+ CanUseAdd = true;
+ }
+ }
+
+ if (CanUseAdd) {
+ Opcode = ARMISD::ADDC;
+ RHS = SubRHS; // Replace RHS with X, so we do LHS + X instead of
+ // LHS - (0 - X)
+ }
+ }
+ }
+
+ // Generate the operation with flags
+ SDValue OpWithFlags;
+ if (Opcode == ARMISD::ADDC) {
+ // Use ADDC: LHS + RHS (where RHS was 0 - X, now X)
+ OpWithFlags = DAG.getNode(ARMISD::ADDC, dl,
+ DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
+ } else {
+ // Use ARMISD::SUBC to generate SUBS instruction (subtract with flags)
+ OpWithFlags = DAG.getNode(ARMISD::SUBC, dl,
+ DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
+ }
+
+ SDValue OpResult = OpWithFlags.getValue(0); // The operation result
+ SDValue Flags = OpWithFlags.getValue(1); // The flags
+
+ // Constants for conditional moves
+ SDValue One = DAG.getConstant(1, dl, MVT::i32);
+ SDValue MinusOne = DAG.getAllOnesConstant(dl, MVT::i32);
+
+ // Select condition codes based on signed vs unsigned
+ ARMCC::CondCodes GTCond = IsSigned ? ARMCC::GT : ARMCC::HI;
+ ARMCC::CondCodes LTCond = IsSigned ? ARMCC::LT : ARMCC::LO;
+
+ // First conditional move: if greater than, set to 1
+ SDValue GTCondValue = DAG.getConstant(GTCond, dl, MVT::i32);
+ SDValue Result1 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, OpResult, One,
+ GTCondValue, Flags);
+
+ // Second conditional move: if less than, set to -1
+ SDValue LTCondValue = DAG.getConstant(LTCond, dl, MVT::i32);
+ SDValue Result2 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, Result1, MinusOne,
+ LTCondValue, Flags);
+
+ if (Op.getValueType() != MVT::i32)
+ Result2 = DAG.getSExtOrTrunc(Result2, dl, Op.getValueType());
+
+ return Result2;
+}
+
SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
switch (Op.getOpcode()) {
@@ -10740,6 +10877,9 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FP_TO_BF16:
return LowerFP_TO_BF16(Op, DAG);
case ARMISD::WIN__DBZCHK: return SDValue();
+ case ISD::UCMP:
+ case ISD::SCMP:
+ return LowerCMP(Op, DAG);
}
}
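
Both paths of LowerCMP implement the generic three-way compare nodes, which
are defined to produce -1, 0 or +1. A scalar C++ reference for what the
emitted subs/cmov (or Thumb1 subs/sbc) sequences must compute (function names
are illustrative):

  #include <cstdint>

  static int32_t scmpRef(int32_t A, int32_t B) { return (A > B) - (A < B); }
  static int32_t ucmpRef(uint32_t A, uint32_t B) { return (A > B) - (A < B); }
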
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 825145d..a84a3cb 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -607,6 +607,8 @@ class VectorType;
bool preferZeroCompareBranch() const override { return true; }
+ bool shouldExpandCmpUsingSelects(EVT VT) const override;
+
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
bool hasAndNotCompare(SDValue V) const override {
@@ -904,6 +906,7 @@ class VectorType;
void LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const;
SDValue LowerFP_TO_BF16(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerCMP(SDValue Op, SelectionDAG &DAG) const;
Register getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const override;
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
index fda9d97..ca5d27d 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
@@ -254,7 +254,8 @@ bool LoongArchAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) {
MCFixup::create(0, Expr, FirstLiteralRelocationKind + ELF::R_LARCH_ALIGN);
F.setVarFixups({Fixup});
F.setLinkerRelaxable();
- F.getParent()->setLinkerRelaxable();
+ if (!F.getParent()->isLinkerRelaxable())
+ F.getParent()->setFirstLinkerRelaxable(F.getLayoutOrder());
return true;
}
diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index 67cc01e..e0ac591 100644
--- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -674,6 +674,9 @@ static constexpr FeatureBitset XAndesGroup = {
static constexpr DecoderListEntry DecoderList32[]{
// Vendor Extensions
+ {DecoderTableXCV32, XCVFeatureGroup, "CORE-V extensions"},
+ {DecoderTableXRivos32, XRivosFeatureGroup, "Rivos"},
+ {DecoderTableXqci32, XqciFeatureGroup, "Qualcomm uC Extensions"},
{DecoderTableXVentana32,
{RISCV::FeatureVendorXVentanaCondOps},
"XVentanaCondOps"},
@@ -690,9 +693,6 @@ static constexpr DecoderListEntry DecoderList32[]{
"MIPS mips.pref"},
{DecoderTableXAndes32, XAndesGroup, "Andes extensions"},
// Standard Extensions
- {DecoderTableXCV32, XCVFeatureGroup, "CORE-V extensions"},
- {DecoderTableXqci32, XqciFeatureGroup, "Qualcomm uC Extensions"},
- {DecoderTableXRivos32, XRivosFeatureGroup, "Rivos"},
{DecoderTable32, {}, "standard 32-bit instructions"},
{DecoderTableRV32Only32, {}, "RV32-only standard 32-bit instructions"},
{DecoderTableZfinx32, {}, "Zfinx (Float in Integer)"},
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
index a997ea5..8d956ce 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
@@ -32,6 +32,11 @@ static cl::opt<bool> ULEB128Reloc(
"riscv-uleb128-reloc", cl::init(true), cl::Hidden,
cl::desc("Emit R_RISCV_SET_ULEB128/E_RISCV_SUB_ULEB128 if appropriate"));
+static cl::opt<bool>
+ AlignRvc("riscv-align-rvc", cl::init(true), cl::Hidden,
+ cl::desc("When generating R_RISCV_ALIGN, insert $alignment-2 "
+ "bytes of NOPs even in norvc code"));
+
RISCVAsmBackend::RISCVAsmBackend(const MCSubtargetInfo &STI, uint8_t OSABI,
bool Is64Bit, const MCTargetOptions &Options)
: MCAsmBackend(llvm::endianness::little), STI(STI), OSABI(OSABI),
@@ -306,12 +311,21 @@ void RISCVAsmBackend::relaxInstruction(MCInst &Inst,
// If conditions are met, compute the padding size and create a fixup encoding
// the padding size in the addend.
bool RISCVAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) {
- // Use default handling unless linker relaxation is enabled and the alignment
- // is larger than the nop size.
- const MCSubtargetInfo *STI = F.getSubtargetInfo();
- if (!STI->hasFeature(RISCV::FeatureRelax))
+ // Alignments before the first linker-relaxable instruction have fixed sizes
+ // and do not require relocations. Alignments after a linker-relaxable
+ // instruction require a relocation, even if the STI specifies norelax.
+ //
+ // firstLinkerRelaxable is the layout order within the subsection, which may
+ // be smaller than the section's order. Therefore, alignments in a
+ // lower-numbered subsection may be unnecessarily treated as linker-relaxable.
+ auto *Sec = F.getParent();
+ if (F.getLayoutOrder() <= Sec->firstLinkerRelaxable())
return false;
- unsigned MinNopLen = STI->hasFeature(RISCV::FeatureStdExtZca) ? 2 : 4;
+
+ // Use default handling unless the alignment is larger than the nop size.
+ const MCSubtargetInfo *STI = F.getSubtargetInfo();
+ unsigned MinNopLen =
+ AlignRvc || STI->hasFeature(RISCV::FeatureStdExtZca) ? 2 : 4;
if (F.getAlignment() <= MinNopLen)
return false;
@@ -321,7 +335,6 @@ bool RISCVAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) {
MCFixup::create(0, Expr, FirstLiteralRelocationKind + ELF::R_RISCV_ALIGN);
F.setVarFixups({Fixup});
F.setLinkerRelaxable();
- F.getParent()->setLinkerRelaxable();
return true;
}
@@ -474,8 +487,9 @@ bool RISCVAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count,
// TODO: emit a mapping symbol right here
if (Count % 4 == 2) {
- // The canonical nop with Zca is c.nop.
- OS.write(STI->hasFeature(RISCV::FeatureStdExtZca) ? "\x01\0" : "\0\0", 2);
+ // The canonical nop with Zca is c.nop. For .balign 4, we generate a 2-byte
+ // c.nop even in a norvc region.
+ OS.write("\x01\0", 2);
Count -= 2;
}