Diffstat (limited to 'llvm/lib')
20 files changed, 777 insertions, 51 deletions
diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp index 256befa..835e270 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -1074,7 +1074,7 @@ bool DependenceInfo::isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *X, /// Compare to see if S is less than Size, using /// -/// isKnownNegative(S - max(Size, 1)) +/// isKnownNegative(S - Size) /// /// with some extra checking if S is an AddRec and we can prove less-than using /// the loop bounds. @@ -1090,21 +1090,34 @@ bool DependenceInfo::isKnownLessThan(const SCEV *S, const SCEV *Size) const { Size = SE->getTruncateOrZeroExtend(Size, MaxType); // Special check for addrecs using BE taken count - const SCEV *Bound = SE->getMinusSCEV(S, Size); - if (const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Bound)) { - if (AddRec->isAffine()) { + if (const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(S)) + if (AddRec->isAffine() && AddRec->hasNoSignedWrap()) { const SCEV *BECount = SE->getBackedgeTakenCount(AddRec->getLoop()); - if (!isa<SCEVCouldNotCompute>(BECount)) { - const SCEV *Limit = AddRec->evaluateAtIteration(BECount, *SE); - if (SE->isKnownNegative(Limit)) - return true; - } + const SCEV *Start = AddRec->getStart(); + const SCEV *Step = AddRec->getStepRecurrence(*SE); + const SCEV *End = AddRec->evaluateAtIteration(BECount, *SE); + const SCEV *Diff0 = SE->getMinusSCEV(Start, Size); + const SCEV *Diff1 = SE->getMinusSCEV(End, Size); + + // If the value of Step is non-negative and the AddRec is non-wrap, it + // reaches its maximum at the last iteration. So it's enough to check + // whether End - Size is negative. + if (SE->isKnownNonNegative(Step) && SE->isKnownNegative(Diff1)) + return true; + + // If the value of Step is non-positive and the AddRec is non-wrap, the + // initial value is its maximum. + if (SE->isKnownNonPositive(Step) && SE->isKnownNegative(Diff0)) + return true; + + // Even if we don't know the sign of Step, either Start or End must be + // the maximum value of the AddRec since it is non-wrap.
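  // A small worked example of this reasoning, with hypothetical values for
  // illustration only: take the no-wrap affine AddRec S = {2,+,4} with a
  // backedge-taken count of 3, so Start = 2 and End = 2 + 4*3 = 14. Because
  // the recurrence cannot wrap, every value it takes lies between Start and
  // End, so S < Size follows whenever both Diff0 = Start - Size and
  // Diff1 = End - Size are known negative; e.g. Size = 16 gives Diff0 = -14
  // and Diff1 = -2, proving S < 16 on every iteration.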
+ if (SE->isKnownNegative(Diff0) && SE->isKnownNegative(Diff1)) + return true; } - } // Check using normal isKnownNegative - const SCEV *LimitedBound = - SE->getMinusSCEV(S, SE->getSMaxExpr(Size, SE->getOne(Size->getType()))); + const SCEV *LimitedBound = SE->getMinusSCEV(S, Size); return SE->isKnownNegative(LimitedBound); } diff --git a/llvm/lib/BinaryFormat/DXContainer.cpp b/llvm/lib/BinaryFormat/DXContainer.cpp index 36d10d0..eb83945 100644 --- a/llvm/lib/BinaryFormat/DXContainer.cpp +++ b/llvm/lib/BinaryFormat/DXContainer.cpp @@ -60,6 +60,17 @@ ArrayRef<EnumEntry<SigComponentType>> dxbc::getSigComponentTypes() { return ArrayRef(SigComponentTypes); } +static const EnumEntry<dxil::ResourceClass> ResourceClassNames[] = { + {"SRV", llvm::dxil::ResourceClass::SRV}, + {"UAV", llvm::dxil::ResourceClass::UAV}, + {"CBV", llvm::dxil::ResourceClass::CBuffer}, + {"Sampler", llvm::dxil::ResourceClass::Sampler}, +}; + +ArrayRef<EnumEntry<llvm::dxil::ResourceClass>> dxbc::getResourceClasses() { + return ArrayRef(ResourceClassNames); +} + static const EnumEntry<RootFlags> RootFlagNames[] = { #define ROOT_SIGNATURE_FLAG(Val, Enum) {#Enum, RootFlags::Enum}, #include "llvm/BinaryFormat/DXContainerConstants.def" diff --git a/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp b/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp index 79904fc..574883e 100644 --- a/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp +++ b/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp @@ -92,16 +92,9 @@ static raw_ostream &operator<<(raw_ostream &OS, return OS; } -static const EnumEntry<dxil::ResourceClass> ResourceClassNames[] = { - {"CBV", dxil::ResourceClass::CBuffer}, - {"SRV", dxil::ResourceClass::SRV}, - {"UAV", dxil::ResourceClass::UAV}, - {"Sampler", dxil::ResourceClass::Sampler}, -}; - static raw_ostream &operator<<(raw_ostream &OS, const ClauseType &Type) { OS << enumToStringRef(dxil::ResourceClass(llvm::to_underlying(Type)), - ArrayRef(ResourceClassNames)); + dxbc::getResourceClasses()); return OS; } diff --git a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp index 9cf4ed1..1cda308 100644 --- a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp +++ b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp @@ -51,13 +51,6 @@ static std::optional<StringRef> extractMdStringValue(MDNode *Node, return NodeText->getString(); } -static const EnumEntry<dxil::ResourceClass> ResourceClassNames[] = { - {"CBV", dxil::ResourceClass::CBuffer}, - {"SRV", dxil::ResourceClass::SRV}, - {"UAV", dxil::ResourceClass::UAV}, - {"Sampler", dxil::ResourceClass::Sampler}, -}; - namespace { // We use the OverloadVisit with std::visit to ensure the compiler catches if a @@ -128,7 +121,7 @@ MDNode *MetadataBuilder::BuildRootDescriptor(const RootDescriptor &Descriptor) { IRBuilder<> Builder(Ctx); StringRef ResName = enumToStringRef(dxil::ResourceClass(to_underlying(Descriptor.Type)), - ArrayRef(ResourceClassNames)); + dxbc::getResourceClasses()); assert(!ResName.empty() && "Provided an invalid Resource Class"); SmallString<7> Name({"Root", ResName}); Metadata *Operands[] = { @@ -170,7 +163,7 @@ MDNode *MetadataBuilder::BuildDescriptorTableClause( IRBuilder<> Builder(Ctx); StringRef ResName = enumToStringRef(dxil::ResourceClass(to_underlying(Clause.Type)), - ArrayRef(ResourceClassNames)); + dxbc::getResourceClasses()); assert(!ResName.empty() && "Provided an invalid Resource Class"); Metadata *Operands[] = { MDString::get(Ctx, ResName), diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp 
b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 260d3c2..ea027e4 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -4014,6 +4014,340 @@ OpenMPIRBuilder::createMasked(const LocationDescription &Loc, /*Conditional*/ true, /*hasFinalize*/ true); } +static llvm::CallInst *emitNoUnwindRuntimeCall(IRBuilder<> &Builder, + llvm::FunctionCallee Callee, + ArrayRef<llvm::Value *> Args, + const llvm::Twine &Name) { + llvm::CallInst *Call = Builder.CreateCall( + Callee, Args, SmallVector<llvm::OperandBundleDef, 1>(), Name); + Call->setDoesNotThrow(); + return Call; +} + +// Expects input basic block is dominated by BeforeScanBB. +// Once Scan directive is encountered, the code after scan directive should be +// dominated by AfterScanBB. Scan directive splits the code sequence to +// scan and input phase. Based on whether inclusive or exclusive +// clause is used in the scan directive and whether input loop or scan loop +// is lowered, it adds jumps to input and scan phase. First Scan loop is the +// input loop and second is the scan loop. The code generated handles only +// inclusive scans now. +OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createScan( + const LocationDescription &Loc, InsertPointTy AllocaIP, + ArrayRef<llvm::Value *> ScanVars, ArrayRef<llvm::Type *> ScanVarsType, + bool IsInclusive, ScanInfo *ScanRedInfo) { + if (ScanRedInfo->OMPFirstScanLoop) { + llvm::Error Err = emitScanBasedDirectiveDeclsIR(AllocaIP, ScanVars, + ScanVarsType, ScanRedInfo); + if (Err) + return Err; + } + if (!updateToLocation(Loc)) + return Loc.IP; + + llvm::Value *IV = ScanRedInfo->IV; + + if (ScanRedInfo->OMPFirstScanLoop) { + // Emit buffer[i] = red; at the end of the input phase. + for (size_t i = 0; i < ScanVars.size(); i++) { + Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]]; + Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr); + Type *DestTy = ScanVarsType[i]; + Value *Val = Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset"); + Value *Src = Builder.CreateLoad(DestTy, ScanVars[i]); + + Builder.CreateStore(Src, Val); + } + } + Builder.CreateBr(ScanRedInfo->OMPScanLoopExit); + emitBlock(ScanRedInfo->OMPScanDispatch, + Builder.GetInsertBlock()->getParent()); + + if (!ScanRedInfo->OMPFirstScanLoop) { + IV = ScanRedInfo->IV; + // Emit red = buffer[i]; at the entrance to the scan phase. + // TODO: if exclusive scan, the red = buffer[i-1] needs to be updated. 
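  // For orientation: together with emitScanBasedDirectiveIR, the generated
  // code for an inclusive scan behaves roughly like this C-style sketch,
  // where in[], out[], op and red are hypothetical stand-ins for the user's
  // scan variable and reduction operator:
  //
  //   for (i = 0; i < Span; ++i) { red = red op in[i]; buffer[i] = red; }
  //   /* log-step combine over buffer, see emitScanReduction */
  //   for (i = 0; i < Span; ++i) { red = buffer[i]; out[i] = red; }
  //
  // The store into buffer[i] was emitted above for the input loop; the loop
  // that follows emits the matching reload for the scan loop.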
+ for (size_t i = 0; i < ScanVars.size(); i++) { + Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]]; + Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr); + Type *DestTy = ScanVarsType[i]; + Value *SrcPtr = + Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset"); + Value *Src = Builder.CreateLoad(DestTy, SrcPtr); + Builder.CreateStore(Src, ScanVars[i]); + } + } + + // TODO: Update it to CreateBr and remove dead blocks + llvm::Value *CmpI = Builder.getInt1(true); + if (ScanRedInfo->OMPFirstScanLoop == IsInclusive) { + Builder.CreateCondBr(CmpI, ScanRedInfo->OMPBeforeScanBlock, + ScanRedInfo->OMPAfterScanBlock); + } else { + Builder.CreateCondBr(CmpI, ScanRedInfo->OMPAfterScanBlock, + ScanRedInfo->OMPBeforeScanBlock); + } + emitBlock(ScanRedInfo->OMPAfterScanBlock, + Builder.GetInsertBlock()->getParent()); + Builder.SetInsertPoint(ScanRedInfo->OMPAfterScanBlock); + return Builder.saveIP(); +} + +Error OpenMPIRBuilder::emitScanBasedDirectiveDeclsIR( + InsertPointTy AllocaIP, ArrayRef<Value *> ScanVars, + ArrayRef<Type *> ScanVarsType, ScanInfo *ScanRedInfo) { + + Builder.restoreIP(AllocaIP); + // Create the shared pointer at alloca IP. + for (size_t i = 0; i < ScanVars.size(); i++) { + llvm::Value *BuffPtr = + Builder.CreateAlloca(Builder.getPtrTy(), nullptr, "vla"); + (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]] = BuffPtr; + } + + // Allocate temporary buffer by master thread + auto BodyGenCB = [&](InsertPointTy AllocaIP, + InsertPointTy CodeGenIP) -> Error { + Builder.restoreIP(CodeGenIP); + Value *AllocSpan = + Builder.CreateAdd(ScanRedInfo->Span, Builder.getInt32(1)); + for (size_t i = 0; i < ScanVars.size(); i++) { + Type *IntPtrTy = Builder.getInt32Ty(); + Constant *Allocsize = ConstantExpr::getSizeOf(ScanVarsType[i]); + Allocsize = ConstantExpr::getTruncOrBitCast(Allocsize, IntPtrTy); + Value *Buff = Builder.CreateMalloc(IntPtrTy, ScanVarsType[i], Allocsize, + AllocSpan, nullptr, "arr"); + Builder.CreateStore(Buff, (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]]); + } + return Error::success(); + }; + // TODO: Perform finalization actions for variables. This has to be + // called for variables which have destructors/finalizers. 
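  // The masked region emitted below corresponds roughly to this sketch, for
  // one hypothetical scan variable of element type T:
  //
  //   T *buffer;                                // pointer kept in the "vla" alloca
  //   #pragma omp masked                        // FilterVal = 0: thread 0 only
  //   buffer = malloc(sizeof(T) * (Span + 1));  // one extra slot for the final value
  //   #pragma omp barrier                       // publish the pointer to the team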
+ auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); }; + + Builder.SetInsertPoint(ScanRedInfo->OMPScanInit->getTerminator()); + llvm::Value *FilterVal = Builder.getInt32(0); + llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = + createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal); + + if (!AfterIP) + return AfterIP.takeError(); + Builder.restoreIP(*AfterIP); + BasicBlock *InputBB = Builder.GetInsertBlock(); + if (InputBB->getTerminator()) + Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator()); + AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier); + if (!AfterIP) + return AfterIP.takeError(); + Builder.restoreIP(*AfterIP); + + return Error::success(); +} + +Error OpenMPIRBuilder::emitScanBasedDirectiveFinalsIR( + ArrayRef<ReductionInfo> ReductionInfos, ScanInfo *ScanRedInfo) { + auto BodyGenCB = [&](InsertPointTy AllocaIP, + InsertPointTy CodeGenIP) -> Error { + Builder.restoreIP(CodeGenIP); + for (ReductionInfo RedInfo : ReductionInfos) { + Value *PrivateVar = RedInfo.PrivateVariable; + Value *OrigVar = RedInfo.Variable; + Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[PrivateVar]; + Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr); + + Type *SrcTy = RedInfo.ElementType; + Value *Val = Builder.CreateInBoundsGEP(SrcTy, Buff, ScanRedInfo->Span, + "arrayOffset"); + Value *Src = Builder.CreateLoad(SrcTy, Val); + + Builder.CreateStore(Src, OrigVar); + Builder.CreateFree(Buff); + } + return Error::success(); + }; + // TODO: Perform finalization actions for variables. This has to be + // called for variables which have destructors/finalizers. + auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); }; + + if (ScanRedInfo->OMPScanFinish->getTerminator()) + Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish->getTerminator()); + else + Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish); + + llvm::Value *FilterVal = Builder.getInt32(0); + llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = + createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal); + + if (!AfterIP) + return AfterIP.takeError(); + Builder.restoreIP(*AfterIP); + BasicBlock *InputBB = Builder.GetInsertBlock(); + if (InputBB->getTerminator()) + Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator()); + AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier); + if (!AfterIP) + return AfterIP.takeError(); + Builder.restoreIP(*AfterIP); + return Error::success(); +} + +OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitScanReduction( + const LocationDescription &Loc, + ArrayRef<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos, + ScanInfo *ScanRedInfo) { + + if (!updateToLocation(Loc)) + return Loc.IP; + auto BodyGenCB = [&](InsertPointTy AllocaIP, + InsertPointTy CodeGenIP) -> Error { + Builder.restoreIP(CodeGenIP); + Function *CurFn = Builder.GetInsertBlock()->getParent(); + // for (int k = 0; k <= ceil(log2(n)); ++k) + llvm::BasicBlock *LoopBB = + BasicBlock::Create(CurFn->getContext(), "omp.outer.log.scan.body"); + llvm::BasicBlock *ExitBB = + splitBB(Builder, false, "omp.outer.log.scan.exit"); + llvm::Function *F = llvm::Intrinsic::getOrInsertDeclaration( + Builder.GetInsertBlock()->getModule(), + (llvm::Intrinsic::ID)llvm::Intrinsic::log2, Builder.getDoubleTy()); + llvm::BasicBlock *InputBB = Builder.GetInsertBlock(); + llvm::Value *Arg = + Builder.CreateUIToFP(ScanRedInfo->Span, Builder.getDoubleTy()); + llvm::Value *LogVal = emitNoUnwindRuntimeCall(Builder, F, Arg, ""); + F = 
llvm::Intrinsic::getOrInsertDeclaration( + Builder.GetInsertBlock()->getModule(), + (llvm::Intrinsic::ID)llvm::Intrinsic::ceil, Builder.getDoubleTy()); + LogVal = emitNoUnwindRuntimeCall(Builder, F, LogVal, ""); + LogVal = Builder.CreateFPToUI(LogVal, Builder.getInt32Ty()); + llvm::Value *NMin1 = Builder.CreateNUWSub( + ScanRedInfo->Span, + llvm::ConstantInt::get(ScanRedInfo->Span->getType(), 1)); + Builder.SetInsertPoint(InputBB); + Builder.CreateBr(LoopBB); + emitBlock(LoopBB, CurFn); + Builder.SetInsertPoint(LoopBB); + + PHINode *Counter = Builder.CreatePHI(Builder.getInt32Ty(), 2); + // size pow2k = 1; + PHINode *Pow2K = Builder.CreatePHI(Builder.getInt32Ty(), 2); + Counter->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 0), + InputBB); + Pow2K->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 1), + InputBB); + // for (size i = n - 1; i >= 2 ^ k; --i) + // tmp[i] op= tmp[i-pow2k]; + llvm::BasicBlock *InnerLoopBB = + BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.body"); + llvm::BasicBlock *InnerExitBB = + BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.exit"); + llvm::Value *CmpI = Builder.CreateICmpUGE(NMin1, Pow2K); + Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB); + emitBlock(InnerLoopBB, CurFn); + Builder.SetInsertPoint(InnerLoopBB); + PHINode *IVal = Builder.CreatePHI(Builder.getInt32Ty(), 2); + IVal->addIncoming(NMin1, LoopBB); + for (ReductionInfo RedInfo : ReductionInfos) { + Value *ReductionVal = RedInfo.PrivateVariable; + Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ReductionVal]; + Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr); + Type *DestTy = RedInfo.ElementType; + Value *IV = Builder.CreateAdd(IVal, Builder.getInt32(1)); + Value *LHSPtr = + Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset"); + Value *OffsetIval = Builder.CreateNUWSub(IV, Pow2K); + Value *RHSPtr = + Builder.CreateInBoundsGEP(DestTy, Buff, OffsetIval, "arrayOffset"); + Value *LHS = Builder.CreateLoad(DestTy, LHSPtr); + Value *RHS = Builder.CreateLoad(DestTy, RHSPtr); + llvm::Value *Result; + InsertPointOrErrorTy AfterIP = + RedInfo.ReductionGen(Builder.saveIP(), LHS, RHS, Result); + if (!AfterIP) + return AfterIP.takeError(); + Builder.CreateStore(Result, LHSPtr); + } + llvm::Value *NextIVal = Builder.CreateNUWSub( + IVal, llvm::ConstantInt::get(Builder.getInt32Ty(), 1)); + IVal->addIncoming(NextIVal, Builder.GetInsertBlock()); + CmpI = Builder.CreateICmpUGE(NextIVal, Pow2K); + Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB); + emitBlock(InnerExitBB, CurFn); + llvm::Value *Next = Builder.CreateNUWAdd( + Counter, llvm::ConstantInt::get(Counter->getType(), 1)); + Counter->addIncoming(Next, Builder.GetInsertBlock()); + // pow2k <<= 1; + llvm::Value *NextPow2K = Builder.CreateShl(Pow2K, 1, "", /*HasNUW=*/true); + Pow2K->addIncoming(NextPow2K, Builder.GetInsertBlock()); + llvm::Value *Cmp = Builder.CreateICmpNE(Next, LogVal); + Builder.CreateCondBr(Cmp, LoopBB, ExitBB); + Builder.SetInsertPoint(ExitBB->getFirstInsertionPt()); + return Error::success(); + }; + + // TODO: Perform finalization actions for variables. This has to be + // called for variables which have destructors/finalizers. 
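  // The BodyGenCB above emits, inside the masked region, the log-step combine
  // described by the in-source comments; in C-like pseudocode over a
  // hypothetical array tmp[0..n-1] (n = Span) with reduction operator op:
  //
  //   for (k = 0; k < ceil(log2(n)); ++k) {
  //     pow2k = 1u << k;
  //     for (i = n - 1; i >= pow2k; --i)
  //       tmp[i] = tmp[i] op tmp[i - pow2k];
  //   }
  //
  // E.g. with op = + and tmp = {1, 2, 3, 4}: after k = 0 the buffer holds
  // {1, 3, 5, 7}, and after k = 1 it holds {1, 3, 6, 10}, the inclusive
  // prefix sums.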
+ auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); }; + + llvm::Value *FilterVal = Builder.getInt32(0); + llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = + createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal); + + if (!AfterIP) + return AfterIP.takeError(); + Builder.restoreIP(*AfterIP); + AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier); + + if (!AfterIP) + return AfterIP.takeError(); + Builder.restoreIP(*AfterIP); + Error Err = emitScanBasedDirectiveFinalsIR(ReductionInfos, ScanRedInfo); + if (Err) + return Err; + + return AfterIP; +} + +Error OpenMPIRBuilder::emitScanBasedDirectiveIR( + llvm::function_ref<Error()> InputLoopGen, + llvm::function_ref<Error(LocationDescription Loc)> ScanLoopGen, + ScanInfo *ScanRedInfo) { + + { + // Emit loop with input phase: + // for (i: 0..<num_iters>) { + // <input phase>; + // buffer[i] = red; + // } + ScanRedInfo->OMPFirstScanLoop = true; + Error Err = InputLoopGen(); + if (Err) + return Err; + } + { + // Emit loop with scan phase: + // for (i: 0..<num_iters>) { + // red = buffer[i]; + // <scan phase>; + // } + ScanRedInfo->OMPFirstScanLoop = false; + Error Err = ScanLoopGen(Builder.saveIP()); + if (Err) + return Err; + } + return Error::success(); +} + +void OpenMPIRBuilder::createScanBBs(ScanInfo *ScanRedInfo) { + Function *Fun = Builder.GetInsertBlock()->getParent(); + ScanRedInfo->OMPScanDispatch = + BasicBlock::Create(Fun->getContext(), "omp.inscan.dispatch"); + ScanRedInfo->OMPAfterScanBlock = + BasicBlock::Create(Fun->getContext(), "omp.after.scan.bb"); + ScanRedInfo->OMPBeforeScanBlock = + BasicBlock::Create(Fun->getContext(), "omp.before.scan.bb"); + ScanRedInfo->OMPScanLoopExit = + BasicBlock::Create(Fun->getContext(), "omp.scan.loop.exit"); +} CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton( DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore, BasicBlock *PostInsertBefore, const Twine &Name) { @@ -4111,6 +4445,76 @@ OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc, return CL; } +Expected<ScanInfo *> OpenMPIRBuilder::scanInfoInitialize() { + ScanInfos.emplace_front(); + ScanInfo *Result = &ScanInfos.front(); + return Result; +} + +Expected<SmallVector<llvm::CanonicalLoopInfo *>> +OpenMPIRBuilder::createCanonicalScanLoops( + const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, + Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop, + InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo) { + LocationDescription ComputeLoc = + ComputeIP.isSet() ? 
LocationDescription(ComputeIP, Loc.DL) : Loc; + updateToLocation(ComputeLoc); + + SmallVector<CanonicalLoopInfo *> Result; + + Value *TripCount = calculateCanonicalLoopTripCount( + ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name); + ScanRedInfo->Span = TripCount; + ScanRedInfo->OMPScanInit = splitBB(Builder, true, "scan.init"); + Builder.SetInsertPoint(ScanRedInfo->OMPScanInit); + + auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) { + Builder.restoreIP(CodeGenIP); + ScanRedInfo->IV = IV; + createScanBBs(ScanRedInfo); + BasicBlock *InputBlock = Builder.GetInsertBlock(); + Instruction *Terminator = InputBlock->getTerminator(); + assert(Terminator->getNumSuccessors() == 1); + BasicBlock *ContinueBlock = Terminator->getSuccessor(0); + Terminator->setSuccessor(0, ScanRedInfo->OMPScanDispatch); + emitBlock(ScanRedInfo->OMPBeforeScanBlock, + Builder.GetInsertBlock()->getParent()); + Builder.CreateBr(ScanRedInfo->OMPScanLoopExit); + emitBlock(ScanRedInfo->OMPScanLoopExit, + Builder.GetInsertBlock()->getParent()); + Builder.CreateBr(ContinueBlock); + Builder.SetInsertPoint( + ScanRedInfo->OMPBeforeScanBlock->getFirstInsertionPt()); + return BodyGenCB(Builder.saveIP(), IV); + }; + + const auto &&InputLoopGen = [&]() -> Error { + Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop( + Builder.saveIP(), BodyGen, Start, Stop, Step, IsSigned, InclusiveStop, + ComputeIP, Name, true, ScanRedInfo); + if (!LoopInfo) + return LoopInfo.takeError(); + Result.push_back(*LoopInfo); + Builder.restoreIP((*LoopInfo)->getAfterIP()); + return Error::success(); + }; + const auto &&ScanLoopGen = [&](LocationDescription Loc) -> Error { + Expected<CanonicalLoopInfo *> LoopInfo = + createCanonicalLoop(Loc, BodyGen, Start, Stop, Step, IsSigned, + InclusiveStop, ComputeIP, Name, true, ScanRedInfo); + if (!LoopInfo) + return LoopInfo.takeError(); + Result.push_back(*LoopInfo); + Builder.restoreIP((*LoopInfo)->getAfterIP()); + ScanRedInfo->OMPScanFinish = Builder.GetInsertBlock(); + return Error::success(); + }; + Error Err = emitScanBasedDirectiveIR(InputLoopGen, ScanLoopGen, ScanRedInfo); + if (Err) + return Err; + return Result; +} + Value *OpenMPIRBuilder::calculateCanonicalLoopTripCount( const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop, const Twine &Name) { @@ -4174,7 +4578,8 @@ Value *OpenMPIRBuilder::calculateCanonicalLoopTripCount( Expected<CanonicalLoopInfo *> OpenMPIRBuilder::createCanonicalLoop( const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop, - InsertPointTy ComputeIP, const Twine &Name) { + InsertPointTy ComputeIP, const Twine &Name, bool InScan, + ScanInfo *ScanRedInfo) { LocationDescription ComputeLoc = ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc; @@ -4185,6 +4590,8 @@ Expected<CanonicalLoopInfo *> OpenMPIRBuilder::createCanonicalLoop( Builder.restoreIP(CodeGenIP); Value *Span = Builder.CreateMul(IV, Step); Value *IndVar = Builder.CreateAdd(Span, Start); + if (InScan) + ScanRedInfo->IV = IndVar; return BodyGenCB(Builder.saveIP(), IndVar); }; LocationDescription LoopLoc = diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp index 8c27958..d0c6144 100644 --- a/llvm/lib/MC/MCObjectStreamer.cpp +++ b/llvm/lib/MC/MCObjectStreamer.cpp @@ -443,7 +443,7 @@ void MCObjectStreamer::emitInstToData(const MCInst &Inst, // MCAssembler::relaxAlign. 
auto *Sec = F->getParent(); if (!Sec->isLinkerRelaxable()) - Sec->setLinkerRelaxable(); + Sec->setFirstLinkerRelaxable(F->getLayoutOrder()); // Do not add data after a linker-relaxable instruction. The difference // between a new label and a label at or before the linker-relaxable // instruction cannot be resolved at assemble-time. diff --git a/llvm/lib/MC/MCSection.cpp b/llvm/lib/MC/MCSection.cpp index 27ca131..9ed6fd1 100644 --- a/llvm/lib/MC/MCSection.cpp +++ b/llvm/lib/MC/MCSection.cpp @@ -20,7 +20,7 @@ using namespace llvm; MCSection::MCSection(StringRef Name, bool IsText, bool IsBss, MCSymbol *Begin) : Begin(Begin), HasInstructions(false), IsRegistered(false), IsText(IsText), - IsBss(IsBss), LinkerRelaxable(false), Name(Name) { + IsBss(IsBss), Name(Name) { DummyFragment.setParent(this); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index a6e4a63..40d960e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -5879,8 +5879,12 @@ AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, const LLT S32 = LLT::scalar(32); MachineRegisterInfo &MRI = *B.getMRI(); - std::tie(BaseReg, ImmOffset) = - AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); + // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before + // being added, so we can only safely match a 32-bit addition with no unsigned + // overflow. + bool CheckNUW = AMDGPU::isGFX1250(ST); + std::tie(BaseReg, ImmOffset) = AMDGPU::getBaseWithConstantOffset( + MRI, OrigOffset, /*KnownBits=*/nullptr, CheckNUW); // If BaseReg is a pointer, convert it to int. if (MRI.getType(BaseReg).isPointer()) diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index ff8efd2..0d2feeb 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -4933,6 +4933,43 @@ bool AMDGPUAsmParser::validateOpSel(const MCInst &Inst) { return false; } + // Packed math FP32 instructions typically accept SGPRs or VGPRs as source + // operands. On gfx12+, if a source operand uses SGPRs, the HW can only read + // the first SGPR and use it for both the low and high operations. 
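  // Concretely, the check below requires that for an SGPR source operand N
  // (N = 0, 1, 2 for src0, src1, src2) bit N of both op_sel and op_sel_hi be
  // clear, so that both halves of the packed operation read the one SGPR the
  // HW actually provides. For example, for src1 the mask is 1 << 1 = 2, and
  // the instruction is rejected when (OpSel & 2) != 0 or (OpSelHi & 2) != 0.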
+ if (isPackedFP32Inst(Opc) && isGFX12Plus()) { + int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); + int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel); + int OpSelHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel_hi); + + const MCOperand &Src0 = Inst.getOperand(Src0Idx); + const MCOperand &Src1 = Inst.getOperand(Src1Idx); + unsigned OpSel = Inst.getOperand(OpSelIdx).getImm(); + unsigned OpSelHi = Inst.getOperand(OpSelHiIdx).getImm(); + + const MCRegisterInfo *TRI = getContext().getRegisterInfo(); + + auto VerifyOneSGPR = [OpSel, OpSelHi](unsigned Index) -> bool { + unsigned Mask = 1U << Index; + return ((OpSel & Mask) == 0) && ((OpSelHi & Mask) == 0); + }; + + if (Src0.isReg() && isSGPR(Src0.getReg(), TRI) && + !VerifyOneSGPR(/*Index=*/0)) + return false; + if (Src1.isReg() && isSGPR(Src1.getReg(), TRI) && + !VerifyOneSGPR(/*Index=*/1)) + return false; + + int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); + if (Src2Idx != -1) { + const MCOperand &Src2 = Inst.getOperand(Src2Idx); + if (Src2.isReg() && isSGPR(Src2.getReg(), TRI) && + !VerifyOneSGPR(/*Index=*/2)) + return false; + } + } + return true; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 8f44c03..5b327fb 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6106,6 +6106,7 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, case MVT::f64: return true; case MVT::f16: + case MVT::bf16: return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF); default: break; @@ -10877,6 +10878,13 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, } } +// Return whether the operation has NoUnsignedWrap property. +static bool isNoUnsignedWrap(SDValue Addr) { + return (Addr.getOpcode() == ISD::ADD && + Addr->getFlags().hasNoUnsignedWrap()) || + Addr->getOpcode() == ISD::OR; +} + bool SITargetLowering::shouldPreservePtrArith(const Function &F, EVT PtrVT) const { return UseSelectionDAGPTRADD && PtrVT == MVT::i64; @@ -10898,8 +10906,14 @@ SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const { if ((C1 = dyn_cast<ConstantSDNode>(N0))) N0 = SDValue(); else if (DAG.isBaseWithConstantOffset(N0)) { - C1 = cast<ConstantSDNode>(N0.getOperand(1)); - N0 = N0.getOperand(0); + // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before + // being added, so we can only safely match a 32-bit addition with no + // unsigned overflow. + bool CheckNUW = AMDGPU::isGFX1250(*Subtarget); + if (!CheckNUW || isNoUnsignedWrap(N0)) { + C1 = cast<ConstantSDNode>(N0.getOperand(1)); + N0 = N0.getOperand(0); + } } if (C1) { diff --git a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp index d8fe850..0a68512 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp @@ -51,7 +51,7 @@ static cl::opt<unsigned> namespace { enum HardClauseType { - // For GFX10: + // For GFX10 and GFX1250: // Texture, buffer, global or scratch memory instructions. 
HARDCLAUSE_VMEM, @@ -102,7 +102,8 @@ public: HardClauseType getHardClauseType(const MachineInstr &MI) { if (MI.mayLoad() || (MI.mayStore() && ST->shouldClusterStores())) { - if (ST->getGeneration() == AMDGPUSubtarget::GFX10) { + if (ST->getGeneration() == AMDGPUSubtarget::GFX10 || + ST->hasGFX1250Insts()) { if ((SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isFLAT(MI)) || SIInstrInfo::isSegmentSpecificFLAT(MI)) { if (ST->hasNSAClauseBug()) { @@ -115,7 +116,6 @@ public: if (SIInstrInfo::isFLAT(MI)) return HARDCLAUSE_FLAT; } else { - assert(ST->getGeneration() >= AMDGPUSubtarget::GFX11); if (SIInstrInfo::isMIMG(MI)) { const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo = diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index f20b22d..19e6bcf 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -18,6 +18,7 @@ #include "GCNSubtarget.h" #include "SIMachineFunctionInfo.h" #include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" #include "llvm/CodeGen/LiveIntervals.h" @@ -5534,6 +5535,15 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } + // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more + // information. + if (AMDGPU::isPackedFP32Inst(Opcode) && AMDGPU::isGFX12Plus(ST)) { + for (unsigned I = 0; I < 3; ++I) { + if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I)) + return false; + } + } + return true; } @@ -6005,6 +6015,21 @@ bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx, const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx]; unsigned Opc = MI.getOpcode(); + // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more + // information. 
+ if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) && + MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) { + constexpr const AMDGPU::OpName OpNames[] = { + AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2}; + + for (auto [I, OpName] : enumerate(OpNames)) { + int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[I]); + if (static_cast<unsigned>(SrcIdx) == OpIdx && + !isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I, &MO)) + return false; + } + } + if (!isLegalRegOperand(MRI, OpInfo, MO)) return false; @@ -6053,6 +6078,39 @@ bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, return true; } +bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand( + const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, + const MachineOperand *MO) const { + constexpr const unsigned NumOps = 3; + constexpr const AMDGPU::OpName OpNames[NumOps * 2] = { + AMDGPU::OpName::src0, AMDGPU::OpName::src1, + AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers, + AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers}; + + assert(SrcN < NumOps); + + if (!MO) { + int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[SrcN]); + if (SrcIdx == -1) + return true; + MO = &MI.getOperand(SrcIdx); + } + + if (!MO->isReg() || !RI.isSGPRReg(MRI, MO->getReg())) + return true; + + int ModsIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[NumOps + SrcN]); + if (ModsIdx == -1) + return true; + + unsigned Mods = MI.getOperand(ModsIdx).getImm(); + bool OpSel = Mods & SISrcMods::OP_SEL_0; + bool OpSelHi = Mods & SISrcMods::OP_SEL_1; + + return !OpSel && !OpSelHi; +} + bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO) const { const MachineFunction &MF = *MI.getParent()->getParent(); @@ -6390,6 +6448,15 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) && !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg())) legalizeOpWithMove(MI, VOP3Idx[2]); + + // Fix the register class of packed FP32 instructions on gfx12+. See + // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information. + if (AMDGPU::isPackedFP32Inst(Opc) && AMDGPU::isGFX12Plus(ST)) { + for (unsigned I = 0; I < 3; ++I) { + if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, /*SrcN=*/I)) + legalizeOpWithMove(MI, VOP3Idx[I]); + } + } } Register SIInstrInfo::readlaneVGPRToSGPR( diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index e042b59..6b9403f 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1287,6 +1287,19 @@ public: const MachineOperand &MO) const; bool isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx, const MachineOperand &MO) const; + + /// Check if \p MO would be a legal operand for gfx12+ packed math FP32 + /// instructions. Packed math FP32 instructions typically accept SGPRs or + /// VGPRs as source operands. On gfx12+, if a source operand uses SGPRs, the + /// HW can only read the first SGPR and use it for both the low and high + /// operations. + /// \p SrcN can be 0, 1, or 2, representing src0, src1, and src2, + /// respectively. If \p MO is nullptr, the operand corresponding to SrcN will + /// be used. 
+ bool isLegalGFX12PlusPackedMathFP32Operand( + const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, + const MachineOperand *MO = nullptr) const; + /// Legalize operands in \p MI by either commuting it or inserting a /// copy of src1. void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 00dcb9b..1e3e9a2 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -3318,6 +3318,20 @@ unsigned getLdsDwGranularity(const MCSubtargetInfo &ST) { return 128; } +bool isPackedFP32Inst(unsigned Opc) { + switch (Opc) { + case AMDGPU::V_PK_ADD_F32: + case AMDGPU::V_PK_ADD_F32_gfx12: + case AMDGPU::V_PK_MUL_F32: + case AMDGPU::V_PK_MUL_F32_gfx12: + case AMDGPU::V_PK_FMA_F32: + case AMDGPU::V_PK_FMA_F32_gfx12: + return true; + default: + return false; + } +} + } // namespace AMDGPU raw_ostream &operator<<(raw_ostream &OS, diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 1252e35..1bcd36c 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1709,6 +1709,8 @@ bool isArgPassedInSGPR(const Argument *Arg); bool isArgPassedInSGPR(const CallBase *CB, unsigned ArgNo); +LLVM_READONLY bool isPackedFP32Inst(unsigned Opc); + LLVM_READONLY bool isLegalSMRDEncodedUnsignedOffset(const MCSubtargetInfo &ST, int64_t EncodedOffset); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index ea99cc4..75d3cfa 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -802,6 +802,12 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, setOperationAction(ISD::BSWAP, VT, Expand); } + if (!Subtarget->isThumb1Only() && !Subtarget->hasV8_1MMainlineOps()) + setOperationAction(ISD::SCMP, MVT::i32, Custom); + + if (!Subtarget->hasV8_1MMainlineOps()) + setOperationAction(ISD::UCMP, MVT::i32, Custom); + setOperationAction(ISD::ConstantFP, MVT::f32, Custom); setOperationAction(ISD::ConstantFP, MVT::f64, Custom); @@ -1634,6 +1640,10 @@ bool ARMTargetLowering::useSoftFloat() const { return Subtarget->useSoftFloat(); } +bool ARMTargetLowering::shouldExpandCmpUsingSelects(EVT VT) const { + return !Subtarget->isThumb1Only() && VT.getSizeInBits() <= 32; +} + // FIXME: It might make sense to define the representative register class as the // nearest super-register that has a non-null superset. For example, DPR_VFP2 is // a super-register of SPR, and DPR is a superset if DPR_VFP2. 
Consequently, @@ -10612,6 +10622,133 @@ SDValue ARMTargetLowering::LowerFP_TO_BF16(SDValue Op, return DAG.getBitcast(MVT::i32, Res); } +SDValue ARMTargetLowering::LowerCMP(SDValue Op, SelectionDAG &DAG) const { + SDLoc dl(Op); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + + // Determine if this is signed or unsigned comparison + bool IsSigned = (Op.getOpcode() == ISD::SCMP); + + // Special case for Thumb1 UCMP only + if (!IsSigned && Subtarget->isThumb1Only()) { + // For Thumb unsigned comparison, use this sequence: + // subs r2, r0, r1 ; r2 = LHS - RHS, sets flags + // sbc r2, r2 ; r2 = r2 - r2 - !carry + // cmp r1, r0 ; compare RHS with LHS + // sbc r1, r1 ; r1 = r1 - r1 - !carry + // subs r0, r2, r1 ; r0 = r2 - r1 (final result) + + // First subtraction: LHS - RHS + SDValue Sub1WithFlags = DAG.getNode( + ARMISD::SUBC, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS); + SDValue Sub1Result = Sub1WithFlags.getValue(0); + SDValue Flags1 = Sub1WithFlags.getValue(1); + + // SUBE: Sub1Result - Sub1Result - !carry + // This gives 0 if LHS >= RHS (unsigned), -1 if LHS < RHS (unsigned) + SDValue Sbc1 = + DAG.getNode(ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT), + Sub1Result, Sub1Result, Flags1); + SDValue Sbc1Result = Sbc1.getValue(0); + + // Second comparison: RHS vs LHS (reverse comparison) + SDValue CmpFlags = DAG.getNode(ARMISD::CMP, dl, FlagsVT, RHS, LHS); + + // SUBE: RHS - RHS - !carry + // This gives 0 if RHS <= LHS (unsigned), -1 if RHS > LHS (unsigned) + SDValue Sbc2 = DAG.getNode( + ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT), RHS, RHS, CmpFlags); + SDValue Sbc2Result = Sbc2.getValue(0); + + // Final subtraction: Sbc1Result - Sbc2Result (no flags needed) + SDValue Result = + DAG.getNode(ISD::SUB, dl, MVT::i32, Sbc1Result, Sbc2Result); + if (Op.getValueType() != MVT::i32) + Result = DAG.getSExtOrTrunc(Result, dl, Op.getValueType()); + + return Result; + } + + // For the ARM assembly pattern: + // subs r0, r0, r1 ; subtract RHS from LHS and set flags + // movgt r0, #1 ; if LHS > RHS, set result to 1 (GT for signed, HI for + // unsigned) mvnlt r0, #0 ; if LHS < RHS, set result to -1 (LT for + // signed, LO for unsigned) + // ; if LHS == RHS, result remains 0 from the subs + + // Optimization: if RHS is a subtraction against 0, use ADDC instead of SUBC + unsigned Opcode = ARMISD::SUBC; + + // Check if RHS is a subtraction against 0: (0 - X) + if (RHS.getOpcode() == ISD::SUB) { + SDValue SubLHS = RHS.getOperand(0); + SDValue SubRHS = RHS.getOperand(1); + + // Check if it's 0 - X + if (isNullConstant(SubLHS)) { + bool CanUseAdd = false; + if (IsSigned) { + // For SCMP: only if X is known to never be INT_MIN (to avoid overflow) + if (RHS->getFlags().hasNoSignedWrap() || !DAG.computeKnownBits(SubRHS) + .getSignedMinValue() + .isMinSignedValue()) { + CanUseAdd = true; + } + } else { + // For UCMP: only if X is known to never be zero + if (DAG.isKnownNeverZero(SubRHS)) { + CanUseAdd = true; + } + } + + if (CanUseAdd) { + Opcode = ARMISD::ADDC; + RHS = SubRHS; // Replace RHS with X, so we do LHS + X instead of + // LHS - (0 - X) + } + } + } + + // Generate the operation with flags + SDValue OpWithFlags; + if (Opcode == ARMISD::ADDC) { + // Use ADDC: LHS + RHS (where RHS was 0 - X, now X) + OpWithFlags = DAG.getNode(ARMISD::ADDC, dl, + DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS); + } else { + // Use ARMISD::SUBC to generate SUBS instruction (subtract with flags) + OpWithFlags = DAG.getNode(ARMISD::SUBC, dl, + DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS); + } + + 
SDValue OpResult = OpWithFlags.getValue(0); // The operation result + SDValue Flags = OpWithFlags.getValue(1); // The flags + + // Constants for conditional moves + SDValue One = DAG.getConstant(1, dl, MVT::i32); + SDValue MinusOne = DAG.getAllOnesConstant(dl, MVT::i32); + + // Select condition codes based on signed vs unsigned + ARMCC::CondCodes GTCond = IsSigned ? ARMCC::GT : ARMCC::HI; + ARMCC::CondCodes LTCond = IsSigned ? ARMCC::LT : ARMCC::LO; + + // First conditional move: if greater than, set to 1 + SDValue GTCondValue = DAG.getConstant(GTCond, dl, MVT::i32); + SDValue Result1 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, OpResult, One, + GTCondValue, Flags); + + // Second conditional move: if less than, set to -1 + SDValue LTCondValue = DAG.getConstant(LTCond, dl, MVT::i32); + SDValue Result2 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, Result1, MinusOne, + LTCondValue, Flags); + + if (Op.getValueType() != MVT::i32) + Result2 = DAG.getSExtOrTrunc(Result2, dl, Op.getValueType()); + + return Result2; +} + SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump()); switch (Op.getOpcode()) { @@ -10740,6 +10877,9 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FP_TO_BF16: return LowerFP_TO_BF16(Op, DAG); case ARMISD::WIN__DBZCHK: return SDValue(); + case ISD::UCMP: + case ISD::SCMP: + return LowerCMP(Op, DAG); } } diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 825145d..a84a3cb 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -607,6 +607,8 @@ class VectorType; bool preferZeroCompareBranch() const override { return true; } + bool shouldExpandCmpUsingSelects(EVT VT) const override; + bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override; bool hasAndNotCompare(SDValue V) const override { @@ -904,6 +906,7 @@ class VectorType; void LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const; SDValue LowerFP_TO_BF16(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCMP(SDValue Op, SelectionDAG &DAG) const; Register getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const override; diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp index fda9d97..ca5d27d 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp @@ -254,7 +254,8 @@ bool LoongArchAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) { MCFixup::create(0, Expr, FirstLiteralRelocationKind + ELF::R_LARCH_ALIGN); F.setVarFixups({Fixup}); F.setLinkerRelaxable(); - F.getParent()->setLinkerRelaxable(); + if (!F.getParent()->isLinkerRelaxable()) + F.getParent()->setFirstLinkerRelaxable(F.getLayoutOrder()); return true; } diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index 67cc01e..e0ac591 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -674,6 +674,9 @@ static constexpr FeatureBitset XAndesGroup = { static constexpr DecoderListEntry DecoderList32[]{ // Vendor Extensions + {DecoderTableXCV32, XCVFeatureGroup, "CORE-V extensions"}, + {DecoderTableXRivos32, XRivosFeatureGroup, "Rivos"}, + {DecoderTableXqci32, 
XqciFeatureGroup, "Qualcomm uC Extensions"}, {DecoderTableXVentana32, {RISCV::FeatureVendorXVentanaCondOps}, "XVentanaCondOps"}, @@ -690,9 +693,6 @@ static constexpr DecoderListEntry DecoderList32[]{ "MIPS mips.pref"}, {DecoderTableXAndes32, XAndesGroup, "Andes extensions"}, // Standard Extensions - {DecoderTableXCV32, XCVFeatureGroup, "CORE-V extensions"}, - {DecoderTableXqci32, XqciFeatureGroup, "Qualcomm uC Extensions"}, - {DecoderTableXRivos32, XRivosFeatureGroup, "Rivos"}, {DecoderTable32, {}, "standard 32-bit instructions"}, {DecoderTableRV32Only32, {}, "RV32-only standard 32-bit instructions"}, {DecoderTableZfinx32, {}, "Zfinx (Float in Integer)"}, diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index a997ea5..8d956ce 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -32,6 +32,11 @@ static cl::opt<bool> ULEB128Reloc( "riscv-uleb128-reloc", cl::init(true), cl::Hidden, cl::desc("Emit R_RISCV_SET_ULEB128/E_RISCV_SUB_ULEB128 if appropriate")); +static cl::opt<bool> + AlignRvc("riscv-align-rvc", cl::init(true), cl::Hidden, + cl::desc("When generating R_RISCV_ALIGN, insert $alignment-2 " + "bytes of NOPs even in norvc code")); + RISCVAsmBackend::RISCVAsmBackend(const MCSubtargetInfo &STI, uint8_t OSABI, bool Is64Bit, const MCTargetOptions &Options) : MCAsmBackend(llvm::endianness::little), STI(STI), OSABI(OSABI), @@ -306,12 +311,21 @@ void RISCVAsmBackend::relaxInstruction(MCInst &Inst, // If conditions are met, compute the padding size and create a fixup encoding // the padding size in the addend. bool RISCVAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) { - // Use default handling unless linker relaxation is enabled and the alignment - // is larger than the nop size. - const MCSubtargetInfo *STI = F.getSubtargetInfo(); - if (!STI->hasFeature(RISCV::FeatureRelax)) + // Alignments before the first linker-relaxable instruction have fixed sizes + // and do not require relocations. Alignments after a linker-relaxable + // instruction require a relocation, even if the STI specifies norelax. + // + // firstLinkerRelaxable is the layout order within the subsection, which may + // be smaller than the section's order. Therefore, alignments in a + // lower-numbered subsection may be unnecessarily treated as linker-relaxable. + auto *Sec = F.getParent(); + if (F.getLayoutOrder() <= Sec->firstLinkerRelaxable()) return false; - unsigned MinNopLen = STI->hasFeature(RISCV::FeatureStdExtZca) ? 2 : 4; + + // Use default handling unless the alignment is larger than the nop size. + const MCSubtargetInfo *STI = F.getSubtargetInfo(); + unsigned MinNopLen = + AlignRvc || STI->hasFeature(RISCV::FeatureStdExtZca) ? 2 : 4; if (F.getAlignment() <= MinNopLen) return false; @@ -321,7 +335,6 @@ bool RISCVAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) { MCFixup::create(0, Expr, FirstLiteralRelocationKind + ELF::R_RISCV_ALIGN); F.setVarFixups({Fixup}); F.setLinkerRelaxable(); - F.getParent()->setLinkerRelaxable(); return true; } @@ -474,8 +487,9 @@ bool RISCVAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count, // TODO: emit a mapping symbol right here if (Count % 4 == 2) { - // The canonical nop with Zca is c.nop. - OS.write(STI->hasFeature(RISCV::FeatureStdExtZca) ? "\x01\0" : "\0\0", 2); + // The canonical nop with Zca is c.nop. For .balign 4, we generate a 2-byte + // c.nop even in a norvc region. + OS.write("\x01\0", 2); Count -= 2; } |
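// A brief worked example of the writeNopData() change above: when padding a
// 6-byte gap, Count % 4 == 2, so the emitter now always starts with the
// 2-byte canonical compressed nop c.nop (encoding 0x0001), even in a norvc
// region; the remaining 4 bytes are presumably filled with the standard
// uncompressed nop (addi x0, x0, 0, encoding 0x00000013) by the code that
// follows this hunk.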