Diffstat (limited to 'llvm/lib/Target')
26 files changed, 428 insertions, 159 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index b3ec65c..2783147 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -366,6 +366,7 @@ def AArch64PostLegalizerCombiner select_to_minmax, or_to_bsp, combine_concat_vector, commute_constant_to_rhs, extract_vec_elt_combines, push_freeze_to_prevent_poison_from_propagating, - combine_mul_cmlt, combine_use_vector_truncate, - extmultomull, truncsat_combines, lshr_of_trunc_of_lshr]> { + combine_mul_cmlt, combine_use_vector_truncate, + extmultomull, truncsat_combines, lshr_of_trunc_of_lshr, + funnel_shift_from_or_shift_constants_are_legal]> { } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d16b116..60aa61e 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -9028,11 +9028,12 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( CallingConv::ID CallerCC = CallerF.getCallingConv(); // SME Streaming functions are not eligible for TCO as they may require - // the streaming mode or ZA to be restored after returning from the call. + // the streaming mode or ZA/ZT0 to be restored after returning from the call. SMECallAttrs CallAttrs = getSMECallAttrs(CallerF, getRuntimeLibcallsInfo(), CLI); if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() || CallAttrs.requiresPreservingAllZAState() || + CallAttrs.requiresPreservingZT0() || CallAttrs.caller().hasStreamingBody()) return false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index ce2b4a5..cd8b249 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -562,9 +562,13 @@ public: void initializeAMDGPURewriteAGPRCopyMFMALegacyPass(PassRegistry &); extern char &AMDGPURewriteAGPRCopyMFMALegacyID; +void initializeAMDGPUUniformIntrinsicCombineLegacyPass(PassRegistry &); +extern char &AMDGPUUniformIntrinsicCombineLegacyPassID; +FunctionPass *createAMDGPUUniformIntrinsicCombineLegacyPass(); + struct AMDGPUUniformIntrinsicCombinePass : public PassInfoMixin<AMDGPUUniformIntrinsicCombinePass> { - PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; namespace AMDGPU { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h index 0eb00cb..529da8d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h @@ -50,6 +50,7 @@ const D16ImageDimIntrinsic *lookupD16ImageDimIntrinsic(unsigned Intr); struct ImageDimIntrinsicInfo { unsigned Intr; unsigned BaseOpcode; + unsigned AtomicNoRetBaseOpcode; MIMGDim Dim; uint8_t NumOffsetArgs; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 97c2c9c..9ce1224 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2006,19 +2006,27 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const { MachineBasicBlock *MBB = MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); + unsigned IntrOpcode = Intr->BaseOpcode; + + // For image atomic: use no-return opcode if result is unused. 
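The GlobalISel hunk below and the matching SelectionDAG hunk in SIISelLowering.cpp further down apply the same idea: if nothing reads the atomic's destination register, select the no-return MIMG opcode. A minimal standalone sketch of that check, with a hypothetical helper name (the real code inlines this logic):

    #include "llvm/CodeGen/MachineInstr.h"
    #include "llvm/CodeGen/MachineRegisterInfo.h"

    // Pick the no-return variant of an atomic when its def is never read.
    // Dead result (ignoring debug uses) means the returning form, and the
    // GLC bit that requests the old value back, are both unnecessary.
    static unsigned pickAtomicOpcode(const llvm::MachineInstr &MI,
                                     const llvm::MachineRegisterInfo &MRI,
                                     unsigned RetOpc, unsigned NoRetOpc) {
      if (NoRetOpc != RetOpc &&
          MRI.use_nodbg_empty(MI.getOperand(0).getReg()))
        return NoRetOpc;
      return RetOpc;
    }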
+ if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode) { + Register ResultDef = MI.getOperand(0).getReg(); + if (MRI->use_nodbg_empty(ResultDef)) + IntrOpcode = Intr->AtomicNoRetBaseOpcode; + } const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = - AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); + AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode); const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); - unsigned IntrOpcode = Intr->BaseOpcode; const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI); const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI); const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI); const unsigned ArgOffset = MI.getNumExplicitDefs() + 1; - Register VDataIn, VDataOut; + Register VDataIn = AMDGPU::NoRegister; + Register VDataOut = AMDGPU::NoRegister; LLT VDataTy; int NumVDataDwords = -1; bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 || @@ -2049,7 +2057,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( unsigned DMaskLanes = 0; if (BaseOpcode->Atomic) { - VDataOut = MI.getOperand(0).getReg(); + if (!BaseOpcode->NoReturn) + VDataOut = MI.getOperand(0).getReg(); VDataIn = MI.getOperand(2).getReg(); LLT Ty = MRI->getType(VDataIn); @@ -2099,8 +2108,9 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this"); unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(); - if (BaseOpcode->Atomic) - CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization + // Keep GLC only when the atomic's result is actually used. + if (BaseOpcode->Atomic && !BaseOpcode->NoReturn) + CPol |= AMDGPU::CPol::GLC; if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) | AMDGPU::CPol::VOLATILE)) return false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index a6074ea..bf6f1a9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -30,7 +30,6 @@ MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass( MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass()) MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this)) MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this)) -MODULE_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass()) #undef MODULE_PASS #ifndef MODULE_PASS_WITH_PARAMS @@ -69,6 +68,7 @@ FUNCTION_PASS("amdgpu-unify-divergent-exit-nodes", AMDGPUUnifyDivergentExitNodesPass()) FUNCTION_PASS("amdgpu-usenative", AMDGPUUseNativeCallsPass()) FUNCTION_PASS("si-annotate-control-flow", SIAnnotateControlFlowPass(*static_cast<const GCNTargetMachine *>(this))) +FUNCTION_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass()) #undef FUNCTION_PASS #ifndef FUNCTION_ANALYSIS diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 6214f4d..75a94ac 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -619,6 +619,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPUPreloadKernArgPrologLegacyPass(*PR); initializeAMDGPUWaitSGPRHazardsLegacyPass(*PR); initializeAMDGPUPreloadKernelArgumentsLegacyPass(*PR); + initializeAMDGPUUniformIntrinsicCombineLegacyPass(*PR); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { @@ -887,9 +888,6 
@@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { if (EarlyInlineAll && !EnableFunctionCalls) PM.addPass(AMDGPUAlwaysInlinePass()); - - if (EnableUniformIntrinsicCombine) - PM.addPass(AMDGPUUniformIntrinsicCombinePass()); }); PB.registerPeepholeEPCallback( @@ -900,6 +898,9 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { FPM.addPass(AMDGPUUseNativeCallsPass()); if (EnableLibCallSimplify) FPM.addPass(AMDGPUSimplifyLibCallsPass()); + + if (EnableUniformIntrinsicCombine) + FPM.addPass(AMDGPUUniformIntrinsicCombinePass()); }); PB.registerCGSCCOptimizerLateEPCallback( diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp index 50c78d8..65e6ed9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp @@ -16,12 +16,6 @@ /// uniformity. And every instruction that's downstream and cares about dynamic /// uniformity must be convergent (and isel will introduce v_readfirstlane for /// them if their operands can't be proven statically uniform). -/// -/// This pass is implemented as a ModulePass because intrinsic declarations -/// exist at the module scope, allowing us to skip processing entirely if no -/// declarations are present and to traverse their user lists directly when -/// they are. A FunctionPass would instead require scanning every instruction -/// in every function to find relevant intrinsics, which is far less efficient. //===----------------------------------------------------------------------===// #include "AMDGPU.h" @@ -97,14 +91,12 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II, Tracker[NotOp] = true; // NOT preserves uniformity LLVM_DEBUG(dbgs() << "Replacing ICMP_EQ: " << *NotOp << '\n'); ICmp->replaceAllUsesWith(NotOp); - ICmp->eraseFromParent(); Changed = true; } else if (Pred == ICmpInst::ICMP_NE && match(OtherOp, m_Zero())) { // Case: (icmp ne %ballot, 0) -> %ballot_arg LLVM_DEBUG(dbgs() << "Replacing ICMP_NE with ballot argument: " << *Src << '\n'); ICmp->replaceAllUsesWith(Src); - ICmp->eraseFromParent(); Changed = true; } } @@ -120,15 +112,17 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II, return false; } -/// Iterates over intrinsic declarations in the module to optimize their uses. -static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) { +/// Iterates over intrinsic calls in the Function to optimize. 
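For reference, the folds this pass performs rest on the fact that a ballot of a uniform value is all-zeros exactly when that value is false. A hedged sketch of the icmp rewrites (mirroring optimizeUniformIntrinsic above; function name hypothetical):

    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/PatternMatch.h"
    using namespace llvm;

    // Sketch: rewrite compares of a uniform ballot against zero.
    //   (icmp eq ballot(x), 0) -> !x    (icmp ne ballot(x), 0) -> x
    static Value *foldUniformBallotCmp(ICmpInst &Cmp, Value *BallotArg) {
      using namespace PatternMatch;
      if (!match(Cmp.getOperand(1), m_Zero()))
        return nullptr;
      if (Cmp.getPredicate() == ICmpInst::ICMP_NE)
        return BallotArg; // uniform x: ballot != 0 iff x is true
      if (Cmp.getPredicate() == ICmpInst::ICMP_EQ)
        return BinaryOperator::CreateNot(BallotArg, "", Cmp.getIterator());
      return nullptr;
    }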
+static bool runUniformIntrinsicCombine(Function &F, const UniformityInfo &UI) { bool IsChanged = false; ValueMap<const Value *, bool> Tracker; - FunctionAnalysisManager &FAM = - AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); - for (Function &F : M) { - switch (F.getIntrinsicID()) { + for (Instruction &I : make_early_inc_range(instructions(F))) { + auto *II = dyn_cast<IntrinsicInst>(&I); + if (!II) + continue; + + switch (II->getIntrinsicID()) { case Intrinsic::amdgcn_permlane64: case Intrinsic::amdgcn_readfirstlane: case Intrinsic::amdgcn_readlane: @@ -137,23 +131,61 @@ static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) { default: continue; } - - for (User *U : make_early_inc_range(F.users())) { - auto *II = cast<IntrinsicInst>(U); - Function *ParentF = II->getFunction(); - const auto &UI = FAM.getResult<UniformityInfoAnalysis>(*ParentF); - IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker); - } + IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker); } return IsChanged; } PreservedAnalyses -AMDGPUUniformIntrinsicCombinePass::run(Module &M, ModuleAnalysisManager &AM) { - if (!runUniformIntrinsicCombine(M, AM)) +AMDGPUUniformIntrinsicCombinePass::run(Function &F, + FunctionAnalysisManager &AM) { + const auto &UI = AM.getResult<UniformityInfoAnalysis>(F); + if (!runUniformIntrinsicCombine(F, UI)) return PreservedAnalyses::all(); PreservedAnalyses PA; PA.preserve<UniformityInfoAnalysis>(); return PA; } + +namespace { +class AMDGPUUniformIntrinsicCombineLegacy : public FunctionPass { +public: + static char ID; + AMDGPUUniformIntrinsicCombineLegacy() : FunctionPass(ID) { + initializeAMDGPUUniformIntrinsicCombineLegacyPass( + *PassRegistry::getPassRegistry()); + } + +private: + bool runOnFunction(Function &F) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<UniformityInfoWrapperPass>(); + AU.addRequired<TargetPassConfig>(); + } +}; +} // namespace + +char AMDGPUUniformIntrinsicCombineLegacy::ID = 0; +char &llvm::AMDGPUUniformIntrinsicCombineLegacyPassID = + AMDGPUUniformIntrinsicCombineLegacy::ID; + +bool AMDGPUUniformIntrinsicCombineLegacy::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + const UniformityInfo &UI = + getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo(); + return runUniformIntrinsicCombine(F, UI); +} + +INITIALIZE_PASS_BEGIN(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE, + "AMDGPU Uniform Intrinsic Combine", false, false) +INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_END(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE, + "AMDGPU Uniform Intrinsic Combine", false, false) + +FunctionPass *llvm::createAMDGPUUniformIntrinsicCombineLegacyPass() { + return new AMDGPUUniformIntrinsicCombineLegacy(); +} diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index 5f6d742..d950131 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -877,69 +877,69 @@ multiclass MIMG_Store <mimgopc op, string asm, bit has_d16, bit mip = 0> { } class MIMG_Atomic_gfx6789_base <bits<8> op, string asm, RegisterOperand data_rc, - RegisterClass addr_rc, string dns=""> - : MIMG_gfx6789 <op, (outs data_rc:$vdst), dns> { - let Constraints = "$vdst = $vdata"; - + RegisterClass addr_rc, bit noRtn, string dns=""> + : MIMG_gfx6789 <op, !if(noRtn, (outs), (outs data_rc:$vdst)), dns> { + let Constraints = 
!if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256_XNULL:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da); - let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$tfe$lwe$da"; + let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$cpol$r128$tfe$lwe$da"; } class MIMG_Atomic_gfx90a_base <bits<8> op, string asm, RegisterOperand data_rc, - RegisterClass addr_rc, string dns=""> - : MIMG_gfx90a <op, (outs getAlign2RegOp<data_rc>.ret:$vdst), dns> { - let Constraints = "$vdst = $vdata"; - + RegisterClass addr_rc, bit noRtn, string dns=""> + : MIMG_gfx90a <op, !if(noRtn, (outs), (outs getAlign2RegOp<data_rc>.ret:$vdst)), dns> { + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = (ins getAlign2RegOp<data_rc>.ret:$vdata, addr_rc:$vaddr, SReg_256_XNULL:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, LWE:$lwe, DA:$da); - let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$lwe$da"; + let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$cpol$r128$lwe$da"; } class MIMG_Atomic_si<mimgopc op, string asm, RegisterOperand data_rc, - RegisterClass addr_rc, bit enableDasm = 0> - : MIMG_Atomic_gfx6789_base<op.SI, asm, data_rc, addr_rc, + RegisterClass addr_rc, bit noRtn = 0, bit enableDasm = 0> + : MIMG_Atomic_gfx6789_base<op.SI, asm, data_rc, addr_rc, noRtn, !if(enableDasm, "GFX6GFX7", "")> { let AssemblerPredicate = isGFX6GFX7; } class MIMG_Atomic_vi<mimgopc op, string asm, RegisterOperand data_rc, - RegisterClass addr_rc, bit enableDasm = 0> - : MIMG_Atomic_gfx6789_base<op.VI, asm, data_rc, addr_rc, !if(enableDasm, "GFX8", "")> { + RegisterClass addr_rc, bit noRtn = 0, bit enableDasm = 0> + : MIMG_Atomic_gfx6789_base<op.VI, asm, data_rc, addr_rc, noRtn, !if(enableDasm, "GFX8", "")> { let AssemblerPredicate = isGFX8GFX9NotGFX90A; let MIMGEncoding = MIMGEncGfx8; } class MIMG_Atomic_gfx90a<mimgopc op, string asm, RegisterOperand data_rc, - RegisterClass addr_rc, bit enableDasm = 0> - : MIMG_Atomic_gfx90a_base<op.VI, asm, data_rc, addr_rc, !if(enableDasm, "GFX90A", "")> { + RegisterClass addr_rc, bit noRtn = 0, bit enableDasm = 0> + : MIMG_Atomic_gfx90a_base<op.VI, asm, data_rc, addr_rc, noRtn, !if(enableDasm, "GFX90A", "")> { let AssemblerPredicate = isGFX90APlus; let MIMGEncoding = MIMGEncGfx90a; } class MIMG_Atomic_gfx10<mimgopc op, string opcode, RegisterOperand DataRC, RegisterClass AddrRC, - bit enableDisasm = 0> - : MIMG_gfx10<op.GFX10M, (outs DataRC:$vdst), + bit noRtn = 0, bit enableDisasm = 0> + : MIMG_gfx10<op.GFX10M, !if(noRtn, (outs), (outs DataRC:$vdst)), !if(enableDisasm, "GFX10", "")> { - let Constraints = "$vdst = $vdata"; - + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe); - let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; + let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; } class MIMG_Atomic_nsa_gfx10<mimgopc op, string opcode, RegisterOperand DataRC, int num_addrs, - bit enableDisasm = 0> - : MIMG_nsa_gfx10<op.GFX10M, (outs DataRC:$vdst), num_addrs, + bit noRtn = 0, bit enableDisasm = 0> + : MIMG_nsa_gfx10<op.GFX10M, !if(noRtn, (outs), (outs DataRC:$vdst)), num_addrs, !if(enableDisasm, "GFX10", 
"")> { - let Constraints = "$vdst = $vdata"; - + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = !con((ins DataRC:$vdata), AddrIns, (ins SReg_256_XNULL:$srsrc, DMask:$dmask, @@ -950,24 +950,24 @@ class MIMG_Atomic_nsa_gfx10<mimgopc op, string opcode, class MIMG_Atomic_gfx11<mimgopc op, string opcode, RegisterOperand DataRC, RegisterClass AddrRC, - bit enableDisasm = 0> - : MIMG_gfx11<op.GFX11, (outs DataRC:$vdst), + bit noRtn = 0, bit enableDisasm = 0> + : MIMG_gfx11<op.GFX11, !if(noRtn, (outs), (outs DataRC:$vdst)), !if(enableDisasm, "GFX11", "")> { - let Constraints = "$vdst = $vdata"; - + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe); - let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; + let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; } class MIMG_Atomic_nsa_gfx11<mimgopc op, string opcode, RegisterOperand DataRC, int num_addrs, - bit enableDisasm = 0> - : MIMG_nsa_gfx11<op.GFX11, (outs DataRC:$vdst), num_addrs, + bit noRtn = 0, bit enableDisasm = 0> + : MIMG_nsa_gfx11<op.GFX11, !if(noRtn, (outs), (outs DataRC:$vdst)), num_addrs, !if(enableDisasm, "GFX11", "")> { - let Constraints = "$vdst = $vdata"; - + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = !con((ins DataRC:$vdata), AddrIns, (ins SReg_256_XNULL:$srsrc, DMask:$dmask, @@ -977,11 +977,11 @@ class MIMG_Atomic_nsa_gfx11<mimgopc op, string opcode, } class VIMAGE_Atomic_gfx12<mimgopc op, string opcode, RegisterOperand DataRC, - int num_addrs, string renamed, bit enableDisasm = 0> - : VIMAGE_gfx12<op.GFX12, (outs DataRC:$vdst), num_addrs, + int num_addrs, string renamed, bit noRtn = 0, bit enableDisasm = 0> + : VIMAGE_gfx12<op.GFX12, !if(noRtn, (outs), (outs DataRC:$vdst)), num_addrs, !if(enableDisasm, "GFX12", "")> { - let Constraints = "$vdst = $vdata"; - + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = !con((ins DataRC:$vdata), AddrIns, (ins SReg_256_XNULL:$rsrc, DMask:$dmask, Dim:$dim, @@ -994,95 +994,96 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm, RegisterOperand data_rc, bit enableDasm = 0, bit isFP = 0, + bit noRtn = 0, string renamed = ""> { let hasSideEffects = 1, // FIXME: remove this mayLoad = 1, mayStore = 1, hasPostISelHook = 0, DisableWQM = 1, - FPAtomic = isFP in { + FPAtomic = isFP, IsAtomicNoRet = noRtn in { let VAddrDwords = 1 in { let ssamp = 0 in { if op.HAS_SI then { - def _V1_si : MIMG_Atomic_si <op, asm, data_rc, VGPR_32, enableDasm>; + def _V1_si : MIMG_Atomic_si <op, asm, data_rc, VGPR_32, noRtn, enableDasm>; } if op.HAS_VI then { - def _V1_vi : MIMG_Atomic_vi <op, asm, data_rc, VGPR_32, enableDasm>; + def _V1_vi : MIMG_Atomic_vi <op, asm, data_rc, VGPR_32, noRtn, enableDasm>; let hasPostISelHook = 1 in - def _V1_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VGPR_32, enableDasm>; + def _V1_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VGPR_32, noRtn, enableDasm>; } if op.HAS_GFX10M then { - def _V1_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VGPR_32, enableDasm>; + def _V1_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VGPR_32, noRtn, enableDasm>; } if op.HAS_GFX11 then { - def _V1_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VGPR_32, enableDasm>; + 
def _V1_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VGPR_32, noRtn, enableDasm>; } } if op.HAS_GFX12 then { - def _V1_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 1, renamed>; + def _V1_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 1, renamed, noRtn>; } } let VAddrDwords = 2 in { let ssamp = 0 in { if op.HAS_SI then { - def _V2_si : MIMG_Atomic_si <op, asm, data_rc, VReg_64, 0>; + def _V2_si : MIMG_Atomic_si <op, asm, data_rc, VReg_64, noRtn, 0>; } if op.HAS_VI then { - def _V2_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_64, 0>; - def _V2_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_64_Align2, 0>; + def _V2_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_64, noRtn, 0>; + def _V2_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_64_Align2, noRtn, 0>; } if op.HAS_GFX10M then { - def _V2_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_64, 0>; - def _V2_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 2, 0>; + def _V2_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_64, noRtn, 0>; + def _V2_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 2, noRtn, 0>; } if op.HAS_GFX11 then { - def _V2_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_64, 0>; - def _V2_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 2, 0>; + def _V2_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_64, noRtn, 0>; + def _V2_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 2, noRtn, 0>; } } if op.HAS_GFX12 then { - def _V2_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 2, renamed>; + def _V2_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 2, renamed, noRtn>; } } let VAddrDwords = 3 in { let ssamp = 0 in { if op.HAS_SI then { - def _V3_si : MIMG_Atomic_si <op, asm, data_rc, VReg_96, 0>; + def _V3_si : MIMG_Atomic_si <op, asm, data_rc, VReg_96, noRtn, 0>; } if op.HAS_VI then { - def _V3_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_96, 0>; - def _V3_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_96_Align2, 0>; + def _V3_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_96, noRtn, 0>; + def _V3_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_96_Align2, noRtn, 0>; } if op.HAS_GFX10M then { - def _V3_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_96, 0>; - def _V3_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 3, 0>; + def _V3_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_96, noRtn, 0>; + def _V3_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 3, noRtn, 0>; } if op.HAS_GFX11 then { - def _V3_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_96, 0>; - def _V3_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 3, 0>; + def _V3_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_96, noRtn, 0>; + def _V3_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 3, noRtn, 0>; } } if op.HAS_GFX12 then { - def _V3_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 3, renamed>; + def _V3_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 3, renamed, noRtn>; } } let VAddrDwords = 4 in { let ssamp = 0 in { if op.HAS_SI then { - def _V4_si : MIMG_Atomic_si <op, asm, data_rc, VReg_128, 0>; + def _V4_si : MIMG_Atomic_si <op, asm, data_rc, VReg_128, noRtn, 0>; } if op.HAS_VI then { - def _V4_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_128, 0>; - def _V4_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_128_Align2, 0>; + def _V4_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_128, noRtn, 0>; + def _V4_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_128_Align2, noRtn, 0>; } if op.HAS_GFX10M then { - def _V4_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_128, 0>; - def _V4_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, 
asm, data_rc, 4, enableDasm>; + def _V4_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_128, noRtn, 0>; + def _V4_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 4, noRtn, enableDasm>; } if op.HAS_GFX11 then { - def _V4_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_128, 0>; - def _V4_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 4, enableDasm>; + def _V4_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_128, noRtn, 0>; + def _V4_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 4, noRtn, enableDasm>; } } if op.HAS_GFX12 then { - def _V4_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 4, renamed, enableDasm>; + def _V4_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 4, renamed, noRtn, enableDasm>; } } } @@ -1095,12 +1096,13 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm, } } -multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0, - string renamed = ""> { // 64-bit atomics - let IsAtomicRet = 1 in { +multiclass MIMG_Atomic_Base <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0, + bit noRtn = 0, string renamed = ""> { // 64-bit atomics + let IsAtomicRet = !not(noRtn) in { def "" : MIMGBaseOpcode { let Atomic = 1; let AtomicX2 = isCmpSwap; + let NoReturn = noRtn; } let BaseOpcode = !cast<MIMGBaseOpcode>(NAME) in { @@ -1109,22 +1111,28 @@ multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0, // Other variants are reconstructed by disassembler using dmask and tfe. if !not(isCmpSwap) then { let VDataDwords = 1 in - defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_32, 1, isFP, renamed>; + defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_32, 1, isFP, noRtn, renamed>; } let VDataDwords = 2 in - defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_64, isCmpSwap, isFP, renamed>; + defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_64, isCmpSwap, isFP, noRtn, renamed>; let VDataDwords = 3 in - defm _V3 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_96, 0, isFP, renamed>; + defm _V3 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_96, 0, isFP, noRtn, renamed>; if isCmpSwap then { let VDataDwords = 4 in - defm _V4 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_128, 0, isFP, renamed>; + defm _V4 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_128, 0, isFP, noRtn, renamed>; let VDataDwords = 5 in - defm _V5 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_160, 0, isFP, renamed>; + defm _V5 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_160, 0, isFP, noRtn, renamed>; } } - } // End IsAtomicRet = 1 + } +} + +multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0, + string renamed = ""> { + defm "" : MIMG_Atomic_Base <op, asm, isCmpSwap, isFP, /*noRtn=*/0, renamed>; + defm "_NORTN" : MIMG_Atomic_Base <op, asm, isCmpSwap, isFP, /*noRtn=*/1, renamed>; } multiclass MIMG_Atomic_Renamed <mimgopc op, string asm, string renamed, @@ -1820,6 +1828,7 @@ let SubtargetPredicate = isGFX12Plus in { class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> { Intrinsic Intr = I; MIMGBaseOpcode BaseOpcode = !cast<MIMGBaseOpcode>(!strconcat("IMAGE_", I.P.OpMod)); + MIMGBaseOpcode AtomicNoRetBaseOpcode = BaseOpcode; AMDGPUDimProps Dim = I.P.Dim; AMDGPUImageDimIntrinsicEval DimEval = AMDGPUImageDimIntrinsicEval<I.P>; @@ -1855,13 +1864,20 @@ class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> { bits<8> CoordTyArg = !add(GradientTyArg, !if(I.P.Gradients, 1, 0)); } +class ImageDimAtomicIntrinsicInfo<AMDGPUImageDimIntrinsic I> + : ImageDimIntrinsicInfo<I> { + MIMGBaseOpcode AtomicNoRetBaseOpcode = + 
!cast<MIMGBaseOpcode>(!strconcat("IMAGE_", I.P.OpMod, "_NORTN")); +} + def ImageDimIntrinsicTable : GenericTable { let FilterClass = "ImageDimIntrinsicInfo"; - let Fields = ["Intr", "BaseOpcode", "Dim", "NumOffsetArgs", "NumBiasArgs", "NumZCompareArgs", "NumGradients", "NumDmask", "NumData", "NumVAddrs", "NumArgs", - "DMaskIndex", "VAddrStart", "OffsetIndex", "BiasIndex", "ZCompareIndex", "GradientStart", "CoordStart", "LodIndex", "MipIndex", "VAddrEnd", - "RsrcIndex", "SampIndex", "UnormIndex", "TexFailCtrlIndex", "CachePolicyIndex", + let Fields = ["Intr", "BaseOpcode", "AtomicNoRetBaseOpcode", "Dim", "NumOffsetArgs", "NumBiasArgs", "NumZCompareArgs", "NumGradients", "NumDmask", "NumData", + "NumVAddrs", "NumArgs", "DMaskIndex", "VAddrStart", "OffsetIndex", "BiasIndex", "ZCompareIndex", "GradientStart", "CoordStart", "LodIndex", "MipIndex", + "VAddrEnd", "RsrcIndex", "SampIndex", "UnormIndex", "TexFailCtrlIndex", "CachePolicyIndex", "BiasTyArg", "GradientTyArg", "CoordTyArg"]; string TypeOf_BaseOpcode = "MIMGBaseOpcode"; + string TypeOf_AtomicNoRetBaseOpcode = "MIMGBaseOpcode"; string TypeOf_Dim = "MIMGDim"; let PrimaryKey = ["Intr"]; @@ -1874,11 +1890,14 @@ def getImageDimIntrinsicByBaseOpcode : SearchIndex { let Key = ["BaseOpcode", "Dim"]; } -foreach intr = !listconcat(AMDGPUImageDimIntrinsics, - AMDGPUImageDimAtomicIntrinsics) in { +foreach intr = AMDGPUImageDimIntrinsics in { def : ImageDimIntrinsicInfo<intr>; } +foreach intr = AMDGPUImageDimAtomicIntrinsics in { + def : ImageDimAtomicIntrinsicInfo<intr>; +} + // L to LZ Optimization Mapping def : MIMGLZMapping<IMAGE_SAMPLE_L, IMAGE_SAMPLE_LZ>; def : MIMGLZMapping<IMAGE_SAMPLE_C_L, IMAGE_SAMPLE_C_LZ>; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index be42291..b34ab2a 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -9134,16 +9134,23 @@ SDValue SITargetLowering::lowerImage(SDValue Op, SDLoc DL(Op); MachineFunction &MF = DAG.getMachineFunction(); const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>(); + unsigned IntrOpcode = Intr->BaseOpcode; + // For image atomic: use no-return opcode if result is unused. 
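Because the new AtomicNoRetBaseOpcode field defaults to BaseOpcode for non-atomic intrinsics, a simple inequality test identifies atomics that have a no-return sibling. A hedged sketch of how a client resolves the opcode from the generated table, using the existing getImageDimIntrinsicInfo accessor (the surrounding use is assumed):

    // Resolve the MIMG base opcode for an image intrinsic, preferring the
    // no-return form when the caller knows the result is dead.
    unsigned resolveBaseOpcode(unsigned IntrID, bool ResultUnused) {
      const llvm::AMDGPU::ImageDimIntrinsicInfo *Info =
          llvm::AMDGPU::getImageDimIntrinsicInfo(IntrID);
      if (!Info)
        return 0; // not an image dim intrinsic
      if (ResultUnused && Info->AtomicNoRetBaseOpcode != Info->BaseOpcode)
        return Info->AtomicNoRetBaseOpcode;
      return Info->BaseOpcode;
    }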
+ if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode && + !Op.getNode()->hasAnyUseOfValue(0)) + IntrOpcode = Intr->AtomicNoRetBaseOpcode; const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = - AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); + AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode); const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); - unsigned IntrOpcode = Intr->BaseOpcode; bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget); bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget); bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget); SmallVector<EVT, 3> ResultTypes(Op->values()); SmallVector<EVT, 3> OrigResultTypes(Op->values()); + if (BaseOpcode->NoReturn && BaseOpcode->Atomic) + ResultTypes.erase(&ResultTypes[0]); + bool IsD16 = false; bool IsG16 = false; bool IsA16 = false; @@ -9162,8 +9169,10 @@ SDValue SITargetLowering::lowerImage(SDValue Op, VData = Op.getOperand(2); IsAtomicPacked16Bit = - (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 || - Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16); + (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 || + IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN || + IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 || + IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN); bool Is64Bit = VData.getValueSizeInBits() == 64; if (BaseOpcode->AtomicX2) { @@ -9173,7 +9182,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op, if (Is64Bit) VData = DAG.getBitcast(MVT::v4i32, VData); - ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32; + if (!BaseOpcode->NoReturn) + ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32; + DMask = Is64Bit ? 0xf : 0x3; NumVDataDwords = Is64Bit ? 4 : 2; } else { @@ -9399,8 +9410,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op, } unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex); - if (BaseOpcode->Atomic) - CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization + // Keep GLC only when the atomic's result is actually used. + if (BaseOpcode->Atomic && !BaseOpcode->NoReturn) + CPol |= AMDGPU::CPol::GLC; if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) | AMDGPU::CPol::VOLATILE)) return Op; @@ -9512,13 +9524,20 @@ SDValue SITargetLowering::lowerImage(SDValue Op, DAG.setNodeMemRefs(NewNode, {MemRef}); } + if (BaseOpcode->NoReturn) { + if (BaseOpcode->Atomic) + return DAG.getMergeValues( + {DAG.getPOISON(OrigResultTypes[0]), SDValue(NewNode, 0)}, DL); + + return SDValue(NewNode, 0); + } + if (BaseOpcode->AtomicX2) { SmallVector<SDValue, 1> Elt; DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1); return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL); } - if (BaseOpcode->NoReturn) - return SDValue(NewNode, 0); + return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail, Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes, NumVDataDwords, IsAtomicPacked16Bit, DL); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 313ae3d..fdba454 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -1298,12 +1298,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom); } - // Use __sincos_stret if available. 
- if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && - getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { - setOperationAction(ISD::FSINCOS, MVT::f64, Custom); - setOperationAction(ISD::FSINCOS, MVT::f32, Custom); - } + setOperationAction(ISD::FSINCOS, MVT::f64, Custom); + setOperationAction(ISD::FSINCOS, MVT::f32, Custom); // FP-ARMv8 implements a lot of rounding-like FP operations. if (Subtarget->hasFPARMv8Base()) { @@ -9835,13 +9831,18 @@ static SDValue LowerUADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) { } SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { - assert(Subtarget->isTargetDarwin()); - // For iOS, we want to call an alternative entry point: __sincos_stret, // return values are passed via sret. SDLoc dl(Op); SDValue Arg = Op.getOperand(0); EVT ArgVT = Arg.getValueType(); + RTLIB::Libcall LC = RTLIB::getSINCOS_STRET(ArgVT); + RTLIB::LibcallImpl SincosStret = getLibcallImpl(LC); + if (SincosStret == RTLIB::Unsupported) + return SDValue(); + + assert(Subtarget->isTargetDarwin()); + Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); auto PtrVT = getPointerTy(DAG.getDataLayout()); @@ -9871,11 +9872,9 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { Args.emplace_back(Arg, ArgTy); - RTLIB::Libcall LC = - (ArgVT == MVT::f64) ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; - const char *LibcallName = getLibcallName(LC); - CallingConv::ID CC = getLibcallCallingConv(LC); - SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL)); + StringRef LibcallName = getLibcallImplName(SincosStret); + CallingConv::ID CC = getLibcallImplCallingConv(SincosStret); + SDValue Callee = DAG.getExternalSymbol(LibcallName.data(), getPointerTy(DL)); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index 44c4830..7ae500a 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -1058,6 +1058,16 @@ def WaveActiveOp : DXILOp<119, waveActiveOp> { IntrinArgIndex<0>, IntrinArgI8<WaveOpKind_Max>, IntrinArgI8<SignedOpKind_Unsigned> ]>, + IntrinSelect<int_dx_wave_reduce_min, + [ + IntrinArgIndex<0>, IntrinArgI8<WaveOpKind_Min>, + IntrinArgI8<SignedOpKind_Signed> + ]>, + IntrinSelect<int_dx_wave_reduce_umin, + [ + IntrinArgIndex<0>, IntrinArgI8<WaveOpKind_Min>, + IntrinArgI8<SignedOpKind_Unsigned> + ]>, ]; let arguments = [OverloadTy, Int8Ty, Int8Ty]; diff --git a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp index e7e7f2c..ce6e812 100644 --- a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp +++ b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp @@ -94,6 +94,8 @@ static bool checkWaveOps(Intrinsic::ID IID) { case Intrinsic::dx_wave_reduce_usum: case Intrinsic::dx_wave_reduce_max: case Intrinsic::dx_wave_reduce_umax: + case Intrinsic::dx_wave_reduce_min: + case Intrinsic::dx_wave_reduce_umin: return true; } } diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp index 68fd3e0..60dfd96 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp +++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp @@ -55,8 +55,10 @@ bool DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable( case Intrinsic::dx_splitdouble: case Intrinsic::dx_wave_readlane: case Intrinsic::dx_wave_reduce_max: + case Intrinsic::dx_wave_reduce_min: case Intrinsic::dx_wave_reduce_sum: case Intrinsic::dx_wave_reduce_umax: + case 
Intrinsic::dx_wave_reduce_umin: case Intrinsic::dx_wave_reduce_usum: case Intrinsic::dx_imad: case Intrinsic::dx_umad: diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp index 54c8972..0573f64 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp @@ -1061,8 +1061,11 @@ HexagonTargetLowering::createHvxPrefixPred(SDValue PredV, const SDLoc &dl, SDValue W0 = isUndef(PredV) ? DAG.getUNDEF(MVT::i64) : DAG.getNode(HexagonISD::P2D, dl, MVT::i64, PredV); - Words[IdxW].push_back(HiHalf(W0, DAG)); - Words[IdxW].push_back(LoHalf(W0, DAG)); + if (Bytes < BitBytes) { + Words[IdxW].push_back(HiHalf(W0, DAG)); + Words[IdxW].push_back(LoHalf(W0, DAG)); + } else + Words[IdxW].push_back(W0); while (Bytes < BitBytes) { IdxW ^= 1; @@ -1083,7 +1086,26 @@ HexagonTargetLowering::createHvxPrefixPred(SDValue PredV, const SDLoc &dl, Bytes *= 2; } + while (Bytes > BitBytes) { + IdxW ^= 1; + Words[IdxW].clear(); + + if (Bytes <= 4) { + for (const SDValue &W : Words[IdxW ^ 1]) { + SDValue T = contractPredicate(W, dl, DAG); + Words[IdxW].push_back(T); + } + } else { + for (const SDValue &W : Words[IdxW ^ 1]) { + Words[IdxW].push_back(W); + } + } + Bytes /= 2; + } + assert(Bytes == BitBytes); + if (BitBytes == 1 && PredTy == MVT::v2i1) + ByteTy = MVT::getVectorVT(MVT::i16, HwLen); SDValue Vec = ZeroFill ? getZero(dl, ByteTy, DAG) : DAG.getUNDEF(ByteTy); SDValue S4 = DAG.getConstant(HwLen-4, dl, MVT::i32); @@ -3157,6 +3179,9 @@ SDValue HexagonTargetLowering::SplitHvxMemOp(SDValue Op, SelectionDAG &DAG) const { auto *MemN = cast<MemSDNode>(Op.getNode()); + if (!MemN->getMemoryVT().isSimple()) + return Op; + MVT MemTy = MemN->getMemoryVT().getSimpleVT(); if (!isHvxPairTy(MemTy)) return Op; diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index 4029e14..729c077 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -493,7 +493,7 @@ NVPTXTTIImpl::getInstructionCost(const User *U, // predicate ("@"). return !AsmInst.empty() && (AsmInst[0] == '@' || isAlpha(AsmInst[0]) || - AsmInst.find(".pragma") != StringRef::npos); + AsmInst.contains(".pragma")); }); return InstCount * TargetTransformInfo::TCC_Basic; } diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 17f04d0..20fc849 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -138,6 +138,11 @@ static cl::opt<unsigned> PPCMinimumJumpTableEntries( "ppc-min-jump-table-entries", cl::init(64), cl::Hidden, cl::desc("Set minimum number of entries to use a jump table on PPC")); +static cl::opt<unsigned> PPCMinimumBitTestCmps( + "ppc-min-bit-test-cmps", cl::init(3), cl::Hidden, + cl::desc("Set minimum of largest number of comparisons to use bit test for " + "switch on PPC.")); + static cl::opt<unsigned> PPCGatherAllAliasesMaxDepth( "ppc-gather-alias-max-depth", cl::init(18), cl::Hidden, cl::desc("max depth when checking alias info in GatherAllAliases()")); @@ -1436,6 +1441,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, // Re-evaluate this value on future HWs that can do better with mtctr. setMinimumJumpTableEntries(PPCMinimumJumpTableEntries); + // The default minimum of largest number in a BitTest cluster is 3. 
+ setMinimumBitTestCmps(PPCMinimumBitTestCmps); + setMinFunctionAlignment(Align(4)); setMinCmpXchgSizeInBits(Subtarget.hasPartwordAtomics() ? 8 : 32); diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td index b0bed71c..da3efdc 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td +++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td @@ -194,6 +194,22 @@ class XX3Form_XTAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, let Inst{31} = XT{5}; } +class XForm_RBS5<bits<6> opCode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opCode, OOL, IOL, asmstr, itin> { + + bits<5> RB; + bits<5> RS; + + let Pattern = pattern; + + let Inst{6...10} = RS; + let Inst{11...15} = 0; + let Inst{16...20} = RB; + let Inst{21...30} = xo; + let Inst{31} = 0; +} + class XX3Form_XTAB6_S<bits<5> xo, dag OOL, dag IOL, string asmstr, list<dag> pattern> : I<59, OOL, IOL, asmstr, NoItinerary> { @@ -317,12 +333,16 @@ let Predicates = [IsISAFuture] in { def TLBIEIO : XForm_RSB5_UIMM2<31, 18, (outs), (ins g8rc:$RB, g8rc:$RS, u2imm:$RIC), "tlbieio $RB, $RS, $RIC", []>; + def MTLPL : XForm_RBS5<31, 275, (outs), (ins gprc:$RB, gprc:$RS), + "mtlpl $RB, $RS", IIC_SprMTSPR, []>; let Interpretation64Bit = 1, isCodeGenOnly = 1 in { def TLBIEP8 : XForm_RSB5_UIMM2_2UIMM1<31, 50, (outs), (ins g8rc:$RB, g8rc:$RS, u2imm:$RIC, u1imm:$PRS, u1imm:$R), "tlbiep $RB, $RS, $RIC, $PRS, $R", []>; + def MTLPL8 : XForm_RBS5<31, 275, (outs), (ins g8rc:$RB, g8rc:$RS), + "mtlpl $RB, $RS", IIC_SprMTSPR, []>, isPPC64; } } diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index 41a9c92..96e8afc 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -823,6 +823,7 @@ static bool relaxableFixupNeedsRelocation(const MCFixupKind Kind) { break; case RISCV::fixup_riscv_rvc_jump: case RISCV::fixup_riscv_rvc_branch: + case RISCV::fixup_riscv_rvc_imm: case RISCV::fixup_riscv_jal: return false; } diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp index 6d587e6..5934c91 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp @@ -688,6 +688,7 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo, // the `jal` again in the assembler. 
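An aside on the PPC setMinimumBitTestCmps() change above: the knob tunes when SelectionDAG lowers a switch cluster as a single word-sized mask test instead of a compare chain. For case values that fit in a machine word, the lowering behaves roughly like this hand-written equivalent (illustrative only):

    // A switch such as:  case 1: case 3: case 7: return true;
    // can be lowered to one shift and one mask test:
    bool inSet(unsigned X) {
      const unsigned long long Mask =
          (1ULL << 1) | (1ULL << 3) | (1ULL << 7);
      return X < 64 && ((Mask >> X) & 1);
    }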
} else if (MIFrm == RISCVII::InstFormatCI) { + FixupKind = RISCV::fixup_riscv_rvc_imm; + AsmRelaxToLinkerRelaxableWithFeature(RISCV::FeatureVendorXqcili); } else if (MIFrm == RISCVII::InstFormatI) { FixupKind = RISCV::fixup_riscv_12_i; } else if (MIFrm == RISCVII::InstFormatQC_EB) { diff --git a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp index 98b636e..9bd66a4 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp @@ -373,6 +373,26 @@ static void doAtomicBinOpExpansion(const RISCVInstrInfo *TII, MachineInstr &MI, .addReg(ScratchReg) .addImm(-1); break; + case AtomicRMWInst::Max: + BuildMI(LoopMBB, DL, TII->get(RISCV::MAX), ScratchReg) + .addReg(DestReg) + .addReg(IncrReg); + break; + case AtomicRMWInst::Min: + BuildMI(LoopMBB, DL, TII->get(RISCV::MIN), ScratchReg) + .addReg(DestReg) + .addReg(IncrReg); + break; + case AtomicRMWInst::UMax: + BuildMI(LoopMBB, DL, TII->get(RISCV::MAXU), ScratchReg) + .addReg(DestReg) + .addReg(IncrReg); + break; + case AtomicRMWInst::UMin: + BuildMI(LoopMBB, DL, TII->get(RISCV::MINU), ScratchReg) + .addReg(DestReg) + .addReg(IncrReg); + break; } BuildMI(LoopMBB, DL, TII->get(getSCForRMW(Ordering, Width, STI)), ScratchReg) .addReg(ScratchReg) @@ -682,6 +702,9 @@ bool RISCVExpandAtomicPseudo::expandAtomicMinMaxOp( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, AtomicRMWInst::BinOp BinOp, bool IsMasked, int Width, MachineBasicBlock::iterator &NextMBBI) { + // Using MIN(U)/MAX(U) is preferable if permitted + if (STI->hasPermissiveZalrsc() && STI->hasStdExtZbb() && !IsMasked) + return expandAtomicBinOp(MBB, MBBI, BinOp, IsMasked, Width, NextMBBI); MachineInstr &MI = *MBBI; DebugLoc DL = MI.getDebugLoc(); diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 2754d78..b4556f6 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1906,6 +1906,25 @@ def FeatureForcedAtomics : SubtargetFeature< def HasAtomicLdSt : Predicate<"Subtarget->hasStdExtZalrsc() || Subtarget->hasForcedAtomics()">; +// The RISC-V Unprivileged Architecture - ISA Volume 1 (Version: 20250508) +// [https://docs.riscv.org/reference/isa/_attachments/riscv-unprivileged.pdf] +// in section 13.3. Eventual Success of Store-Conditional Instructions, defines +// _constrained_ LR/SC loops: +// The dynamic code executed between the LR and SC instructions can only +// contain instructions from the base ''I'' instruction set, excluding loads, +// stores, backward jumps, taken backward branches, JALR, FENCE, and SYSTEM +// instructions. Compressed forms of the aforementioned ''I'' instructions in +// the Zca and Zcb extensions are also permitted. +// LR/SC loops that do not adhere to the above are _unconstrained_ LR/SC loops, +// and success is implementation specific. For implementations which know that +// non-base instructions (such as the ''B'' extension) will not violate any +// forward progress guarantees, using these instructions to reduce the LR/SC +// sequence length is desirable.
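The FeaturePermissiveZalrsc definition that follows gates this because the expansion above places a Zbb MIN/MAX (a non-base instruction) between LR and SC, making the loop unconstrained in the spec's terms; that is only safe on cores that still guarantee forward progress for such loops. A hedged illustration of the resulting loop, written as GNU inline asm with simplified constraints:

    // atomicrmw min on i32, expanded as lr.w/min/sc.w when
    // permissive-zalrsc and Zbb are both available (sketch only).
    static int atomicMinW(int *Ptr, int Val) {
      int Old, Tmp, Fail;
      __asm__ volatile("1: lr.w %0, (%3)\n"
                       "   min  %1, %0, %4\n"
                       "   sc.w %2, %1, (%3)\n"
                       "   bnez %2, 1b\n"
                       : "=&r"(Old), "=&r"(Tmp), "=&r"(Fail)
                       : "r"(Ptr), "r"(Val)
                       : "memory");
      return Old; // previous value, as atomicrmw requires
    }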
+def FeaturePermissiveZalrsc + : SubtargetFeature< + "permissive-zalrsc", "HasPermissiveZalrsc", "true", + "Implementation permits non-base instructions between LR/SC pairs">; + def FeatureTaggedGlobals : SubtargetFeature<"tagged-globals", "AllowTaggedGlobals", "true", "Use an instruction sequence for taking the address of a global " diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp index 6181abb..47022b3 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp @@ -745,7 +745,7 @@ Register SPIRVGlobalRegistry::buildGlobalVariable( .addDef(ResVReg) .addUse(getSPIRVTypeID(BaseType)) .addImm(static_cast<uint32_t>(Storage)); - if (Init != 0) + if (Init) MIB.addUse(Init->getOperand(0).getReg()); // ISel may introduce a new register on this step, so we need to add it to // DT and correct its type avoiding fails on the next stage. diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 021353a..3fea21e 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -222,6 +222,9 @@ private: bool selectWaveReduceMax(Register ResVReg, const SPIRVType *ResType, MachineInstr &I, bool IsUnsigned) const; + bool selectWaveReduceMin(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I, bool IsUnsigned) const; + bool selectWaveReduceSum(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const; @@ -2456,6 +2459,35 @@ bool SPIRVInstructionSelector::selectWaveReduceMax(Register ResVReg, .constrainAllUses(TII, TRI, RBI); } +bool SPIRVInstructionSelector::selectWaveReduceMin(Register ResVReg, + const SPIRVType *ResType, + MachineInstr &I, + bool IsUnsigned) const { + assert(I.getNumOperands() == 3); + assert(I.getOperand(2).isReg()); + MachineBasicBlock &BB = *I.getParent(); + Register InputRegister = I.getOperand(2).getReg(); + SPIRVType *InputType = GR.getSPIRVTypeForVReg(InputRegister); + + if (!InputType) + report_fatal_error("Input Type could not be determined."); + + SPIRVType *IntTy = GR.getOrCreateSPIRVIntegerType(32, I, TII); + // Retrieve the operation to use based on input type + bool IsFloatTy = GR.isScalarOrVectorOfType(InputRegister, SPIRV::OpTypeFloat); + auto IntegerOpcodeType = + IsUnsigned ? SPIRV::OpGroupNonUniformUMin : SPIRV::OpGroupNonUniformSMin; + auto Opcode = IsFloatTy ?
SPIRV::OpGroupNonUniformFMin : IntegerOpcodeType; + return BuildMI(BB, I, I.getDebugLoc(), TII.get(Opcode)) + .addDef(ResVReg) + .addUse(GR.getSPIRVTypeID(ResType)) + .addUse(GR.getOrCreateConstInt(SPIRV::Scope::Subgroup, I, IntTy, TII, + !STI.isShader())) + .addImm(SPIRV::GroupOperation::Reduce) + .addUse(I.getOperand(2).getReg()) + .constrainAllUses(TII, TRI, RBI); +} + bool SPIRVInstructionSelector::selectWaveReduceSum(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const { @@ -3431,6 +3463,10 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, return selectWaveReduceMax(ResVReg, ResType, I, /*IsUnsigned*/ true); case Intrinsic::spv_wave_reduce_max: return selectWaveReduceMax(ResVReg, ResType, I, /*IsUnsigned*/ false); + case Intrinsic::spv_wave_reduce_umin: + return selectWaveReduceMin(ResVReg, ResType, I, /*IsUnsigned*/ true); + case Intrinsic::spv_wave_reduce_min: + return selectWaveReduceMin(ResVReg, ResType, I, /*IsUnsigned*/ false); case Intrinsic::spv_wave_reduce_sum: return selectWaveReduceSum(ResVReg, ResType, I); case Intrinsic::spv_wave_readlane: diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 3da720f..58109ac 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -8973,8 +8973,7 @@ SystemZTargetLowering::getJumpConditionMergingParams(Instruction::BinaryOps Opc, if (const auto *CB = dyn_cast<CallBase>(RHSVal)) { if (CB->isInlineAsm()) { const InlineAsm *IA = cast<InlineAsm>(CB->getCalledOperand()); - return IA && - IA->getConstraintString().find("{@cc}") != std::string::npos; + return IA && IA->getConstraintString().contains("{@cc}"); } } } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 410f20e..5785440 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2572,11 +2572,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } // Combine sin / cos into _sincos_stret if it is available. - if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && - getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { - setOperationAction(ISD::FSINCOS, MVT::f64, Custom); - setOperationAction(ISD::FSINCOS, MVT::f32, Custom); - } + setOperationAction(ISD::FSINCOS, MVT::f64, Custom); + setOperationAction(ISD::FSINCOS, MVT::f32, Custom); if (Subtarget.isTargetWin64()) { setOperationAction(ISD::SDIV, MVT::i128, Custom); @@ -33067,26 +33064,30 @@ static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) { static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue Arg = Op.getOperand(0); + EVT ArgVT = Arg.getValueType(); + bool isF64 = ArgVT == MVT::f64; + + RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; + const char *LibcallName = TLI.getLibcallName(LC); + if (!LibcallName) + return SDValue(); + assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit()); // For MacOSX, we want to call an alternative entry point: __sincos_stret, // which returns the values as { float, float } (in XMM0) or // { double, double } (which is returned in XMM0, XMM1). 
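For context on both the ARM and X86 sincos hunks: the Darwin-only __sincos_stret entry points compute sine and cosine in one call and hand back a small struct rather than writing through out-pointers. A hedged sketch of the contract the lowering relies on (entry-point names assumed from the RTLIB::SINCOS_STRET_* libcalls; register conventions as in the comment above):

    // {float,float} comes back in XMM0 on x86_64; the f64 pair uses
    // XMM0/XMM1 (or sret on i386), per the comment above.
    struct SinCosF { float Sin, Cos; };
    struct SinCosD { double Sin, Cos; };
    extern "C" SinCosF __sincosf_stret(float);  // assumed f32 entry point
    extern "C" SinCosD __sincos_stret(double);  // assumed f64 entry point

    double tangent(double X) {
      SinCosD R = __sincos_stret(X); // one call yields both values
      return R.Sin / R.Cos;
    }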
SDLoc dl(Op); - SDValue Arg = Op.getOperand(0); - EVT ArgVT = Arg.getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); TargetLowering::ArgListTy Args; Args.emplace_back(Arg, ArgTy); - bool isF64 = ArgVT == MVT::f64; // Only optimize x86_64 for now. i386 is a bit messy. For f32, // the small struct {f32, f32} is returned in (eax, edx). For f64, // the results are returned via SRet in memory. - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; - const char *LibcallName = TLI.getLibcallName(LC); SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout())); @@ -54634,6 +54635,7 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); SDValue Src = N->getOperand(0); + EVT SrcVT = Src.getValueType(); SDLoc DL(N); // Attempt to pre-truncate inputs to arithmetic ops instead. @@ -54652,6 +54654,40 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget)) return V; + // Fold trunc(srl(load(p),amt)) -> load(p+amt/8) + // If we're shifting down byte aligned bit chunks from a larger load for + // truncation, see if we can convert the shift into a pointer offset instead. + // Limit this to normal (non-ext) scalar integer loads. + if (SrcVT.isScalarInteger() && Src.getOpcode() == ISD::SRL && + Src.hasOneUse() && Src.getOperand(0).hasOneUse() && + ISD::isNormalLoad(Src.getOperand(0).getNode())) { + auto *Ld = cast<LoadSDNode>(Src.getOperand(0)); + if (Ld->isSimple() && VT.isByteSized() && + isPowerOf2_64(VT.getSizeInBits())) { + SDValue ShAmt = Src.getOperand(1); + KnownBits KnownAmt = DAG.computeKnownBits(ShAmt); + // Check the shift amount is byte aligned. + // Check the truncation doesn't use any shifted in (zero) top bits. + if (KnownAmt.countMinTrailingZeros() >= 3 && + KnownAmt.getMaxValue().ule(SrcVT.getSizeInBits() - + VT.getSizeInBits())) { + EVT PtrVT = Ld->getBasePtr().getValueType(); + SDValue PtrBitOfs = DAG.getZExtOrTrunc(ShAmt, DL, PtrVT); + SDValue PtrByteOfs = + DAG.getNode(ISD::SRL, DL, PtrVT, PtrBitOfs, + DAG.getShiftAmountConstant(3, PtrVT, DL)); + SDValue NewPtr = DAG.getMemBasePlusOffset( + Ld->getBasePtr(), PtrByteOfs, DL, SDNodeFlags::NoUnsignedWrap); + SDValue NewLoad = + DAG.getLoad(VT, DL, Ld->getChain(), NewPtr, Ld->getPointerInfo(), + Align(), Ld->getMemOperand()->getFlags()); + DAG.ReplaceAllUsesOfValueWith(Src.getOperand(0).getValue(1), + NewLoad.getValue(1)); + return NewLoad; + } + } + } + // The bitcast source is a direct mmx result. // Detect bitcasts between i32 to x86mmx if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
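The new combineTruncate fold is easiest to see in source terms: a byte-aligned right shift feeding a truncate is the same as loading the narrower value at a byte offset, on a little-endian target. An illustrative before/after (little-endian assumed, as on x86):

    #include <cstdint>
    #include <cstring>

    // Before: load 8 bytes, shift right by 24, truncate to 8 bits.
    uint8_t getByte3Before(const uint64_t *P) {
      return static_cast<uint8_t>(*P >> 24);
    }

    // After the fold: one 1-byte load at offset 24 / 8 = 3.
    uint8_t getByte3After(const uint64_t *P) {
      uint8_t B;
      std::memcpy(&B, reinterpret_cast<const uint8_t *>(P) + 3, 1);
      return B;
    }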