diff options
Diffstat (limited to 'llvm/lib/Target')
46 files changed, 684 insertions, 1118 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 49d8b44..59cc1df 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -13,7 +13,6 @@ #include "AMDGPU.h" #include "GCNSubtarget.h" #include "Utils/AMDGPUBaseInfo.h" -#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsR600.h" #include "llvm/Target/TargetMachine.h" diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index e3ca09e..6118933 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -391,8 +391,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, // Library functions. These default to Expand, but we have instructions // for them. setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR, - ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM}, - MVT::f32, Legal); + ISD::FROUNDEVEN, ISD::FTRUNC}, + {MVT::f16, MVT::f32}, Legal); + setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, MVT::f32, Legal); setOperationAction(ISD::FLOG2, MVT::f32, Custom); setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom); @@ -412,9 +413,10 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom); - if (Subtarget->has16BitInsts()) + if (Subtarget->has16BitInsts()) { setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal); - else { + setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Legal); + } else { setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal); setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom); } @@ -4844,94 +4846,11 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, return SDValue(); } -// Detect when CMP and SELECT use the same constant and fold them to avoid -// loading the constant twice. Specifically handles patterns like: -// %cmp = icmp eq i32 %val, 4242 -// %sel = select i1 %cmp, i32 4242, i32 %other -// It can be optimized to reuse %val instead of 4242 in select. -static SDValue -foldCmpSelectWithSharedConstant(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, - const AMDGPUSubtarget *ST) { - SDValue Cond = N->getOperand(0); - SDValue TrueVal = N->getOperand(1); - SDValue FalseVal = N->getOperand(2); - - // Check if condition is a comparison. - if (Cond.getOpcode() != ISD::SETCC) - return SDValue(); - - SDValue LHS = Cond.getOperand(0); - SDValue RHS = Cond.getOperand(1); - ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); - - bool isFloatingPoint = LHS.getValueType().isFloatingPoint(); - bool isInteger = LHS.getValueType().isInteger(); - - // Handle simple floating-point and integer types only. - if (!isFloatingPoint && !isInteger) - return SDValue(); - - bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ); - bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE); - if (!isEquality && !isNonEquality) - return SDValue(); - - SDValue ArgVal, ConstVal; - if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) || - (isInteger && isa<ConstantSDNode>(RHS))) { - ConstVal = RHS; - ArgVal = LHS; - } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) || - (isInteger && isa<ConstantSDNode>(LHS))) { - ConstVal = LHS; - ArgVal = RHS; - } else { - return SDValue(); - } - - // Check if constant should not be optimized - early return if not. - if (isFloatingPoint) { - const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF(); - const GCNSubtarget *GCNST = static_cast<const GCNSubtarget *>(ST); - - // Only optimize normal floating-point values (finite, non-zero, and - // non-subnormal as per IEEE 754), skip optimization for inlinable - // floating-point constants. - if (!Val.isNormal() || GCNST->getInstrInfo()->isInlineConstant(Val)) - return SDValue(); - } else { - int64_t IntVal = cast<ConstantSDNode>(ConstVal)->getSExtValue(); - - // Skip optimization for inlinable integer immediates. - // Inlinable immediates include: -16 to 64 (inclusive). - if (IntVal >= -16 && IntVal <= 64) - return SDValue(); - } - - // For equality and non-equality comparisons, patterns: - // select (setcc x, const), const, y -> select (setcc x, const), x, y - // select (setccinv x, const), y, const -> select (setccinv x, const), y, x - if (!(isEquality && TrueVal == ConstVal) && - !(isNonEquality && FalseVal == ConstVal)) - return SDValue(); - - SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal; - SDValue SelectRHS = - (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal; - return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond, - SelectLHS, SelectRHS); -} - SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const { if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0))) return Folded; - // Try to fold CMP + SELECT patterns with shared constants (both FP and - // integer). - if (SDValue Folded = foldCmpSelectWithSharedConstant(N, DCI, Subtarget)) - return Folded; - SDValue Cond = N->getOperand(0); if (Cond.getOpcode() != ISD::SETCC) return SDValue(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index c865082..38f9ee5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -836,8 +836,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { // When we are not using -fgpu-rdc, we can run accelerator code // selection relatively early, but still after linking to prevent // eager removal of potentially reachable symbols. - if (EnableHipStdPar) + if (EnableHipStdPar) { + PM.addPass(HipStdParMathFixupPass()); PM.addPass(HipStdParAcceleratorCodeSelectionPass()); + } PM.addPass(AMDGPUPrintfRuntimeBindingPass()); } @@ -916,8 +918,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { // selection after linking to prevent, otherwise we end up removing // potentially reachable symbols that were exported as external in other // modules. - if (EnableHipStdPar) + if (EnableHipStdPar) { + PM.addPass(HipStdParMathFixupPass()); PM.addPass(HipStdParAcceleratorCodeSelectionPass()); + } // We want to support the -lto-partitions=N option as "best effort". // For that, we need to lower LDS earlier in the pipeline before the // module is partitioned for codegen. diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 7207c25..0f172e0d 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -369,31 +369,68 @@ multiclass FLAT_Global_Store_Pseudo_t16<string opName> { } } -class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0> : FLAT_Pseudo< +// Async loads, introduced in gfx1250, will store directly +// to a DS address in vdst (they will not use M0 for DS addess). +class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0, bit IsAsync = 0> : FLAT_Pseudo< opName, (outs ), !con( - !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)), - (ins flat_offset:$offset, CPol_0:$cpol)), - " $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> { - let LGKM_CNT = 1; + !if(IsAsync, (ins VGPR_32:$vdst), (ins)), + !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)), + (ins flat_offset:$offset, CPol_0:$cpol)), + !if(IsAsync, " $vdst,", "")#" $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> { + let LGKM_CNT = !not(IsAsync); + let VM_CNT = !not(IsAsync); + let ASYNC_CNT = IsAsync; let is_flat_global = 1; let lds = 1; let has_data = 0; + let has_vdst = IsAsync; // vdst for ds address with IsAsync + let mayLoad = 1; + let mayStore = 1; + let has_saddr = 1; + let enabled_saddr = EnableSaddr; + let VALU = 1; + let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", ""); + let Uses = !if(IsAsync, [EXEC, ASYNCcnt], [M0, EXEC]); + let Defs = !if(IsAsync, [ASYNCcnt], []); + let SchedRW = [WriteVMEM, WriteLDS]; +} + +multiclass FLAT_Global_Load_LDS_Pseudo<string opName, bit IsAsync = 0> { + def "" : FLAT_Global_Load_LDS_Pseudo<opName, 0, IsAsync>, + GlobalSaddrTable<0, opName>; + def _SADDR : FLAT_Global_Load_LDS_Pseudo<opName, 1, IsAsync>, + GlobalSaddrTable<1, opName>; +} + +class FLAT_Global_STORE_LDS_Pseudo <string opName, bit EnableSaddr = 0> : FLAT_Pseudo< + opName, + (outs ), + !con( + !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)), (ins VGPR_32:$vdata), + (ins flat_offset:$offset, CPol_0:$cpol)), + " $vaddr, $vdata"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> { + let VM_CNT = 0; + let ASYNC_CNT = 1; + let is_flat_global = 1; + let lds = 1; + let has_data = 1; // vdata for ds address let has_vdst = 0; let mayLoad = 1; let mayStore = 1; let has_saddr = 1; let enabled_saddr = EnableSaddr; let VALU = 1; - let Uses = [M0, EXEC]; + let Uses = [EXEC, ASYNCcnt]; + let Defs = [ASYNCcnt]; let SchedRW = [WriteVMEM, WriteLDS]; } -multiclass FLAT_Global_Load_LDS_Pseudo<string opName> { - def "" : FLAT_Global_Load_LDS_Pseudo<opName>, +multiclass FLAT_Global_STORE_LDS_Pseudo<string opName> { + def "" : FLAT_Global_STORE_LDS_Pseudo<opName>, GlobalSaddrTable<0, opName>; - def _SADDR : FLAT_Global_Load_LDS_Pseudo<opName, 1>, + def _SADDR : FLAT_Global_STORE_LDS_Pseudo<opName, 1>, GlobalSaddrTable<1, opName>; } @@ -1156,6 +1193,15 @@ let SubtargetPredicate = isGFX12Plus in { let SubtargetPredicate = isGFX1250Plus in { +defm GLOBAL_LOAD_ASYNC_TO_LDS_B8 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b8", 1>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B32 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b32", 1>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B64 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b64", 1>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B128 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b128", 1>; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B8 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b8">; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B32 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b32">; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B64 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b64">; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B128 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b128">; + def TENSOR_SAVE : FLAT_Global_Tensor_Pseudo<"tensor_save", 1>; def TENSOR_STOP : FLAT_Global_Tensor_Pseudo<"tensor_stop">; } // End SubtargetPredicate = isGFX1250Plus @@ -3374,6 +3420,15 @@ defm GLOBAL_LOAD_MONITOR_B32 : VFLAT_Real_AllAddr_gfx1250<0x070>; defm GLOBAL_LOAD_MONITOR_B64 : VFLAT_Real_AllAddr_gfx1250<0x071>; defm GLOBAL_LOAD_MONITOR_B128 : VFLAT_Real_AllAddr_gfx1250<0x072>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B8 : VFLAT_Real_AllAddr_gfx1250<0x5f>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B32 : VFLAT_Real_AllAddr_gfx1250<0x60>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B64 : VFLAT_Real_AllAddr_gfx1250<0x61>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B128 : VFLAT_Real_AllAddr_gfx1250<0x62>; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B8 : VFLAT_Real_AllAddr_gfx1250<0x63>; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B32 : VFLAT_Real_AllAddr_gfx1250<0x64>; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B64 : VFLAT_Real_AllAddr_gfx1250<0x65>; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B128 : VFLAT_Real_AllAddr_gfx1250<0x66>; + defm GLOBAL_LOAD_TR_B128_w32 : VFLAT_Real_AllAddr_gfx1250<0x057, "global_load_tr16_b128">; defm GLOBAL_LOAD_TR_B64_w32 : VFLAT_Real_AllAddr_gfx1250<0x058, "global_load_tr8_b64">; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp index 9a2bab1..0a0a107 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -537,6 +537,63 @@ unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const { return getMaxNumVGPRs(MF.getFunction()); } +std::pair<unsigned, unsigned> +GCNSubtarget::getMaxNumVectorRegs(const Function &F) const { + const unsigned MaxVectorRegs = getMaxNumVGPRs(F); + + unsigned MaxNumVGPRs = MaxVectorRegs; + unsigned MaxNumAGPRs = 0; + + // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically, + // a wave may have up to 512 total vector registers combining together both + // VGPRs and AGPRs. Hence, in an entry function without calls and without + // AGPRs used within it, it is possible to use the whole vector register + // budget for VGPRs. + // + // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split + // register file accordingly. + if (hasGFX90AInsts()) { + unsigned MinNumAGPRs = 0; + const unsigned TotalNumAGPRs = AMDGPU::AGPR_32RegClass.getNumRegs(); + const unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); + + const std::pair<unsigned, unsigned> DefaultNumAGPR = {~0u, ~0u}; + + // TODO: The lower bound should probably force the number of required + // registers up, overriding amdgpu-waves-per-eu. + std::tie(MinNumAGPRs, MaxNumAGPRs) = + AMDGPU::getIntegerPairAttribute(F, "amdgpu-agpr-alloc", DefaultNumAGPR, + /*OnlyFirstRequired=*/true); + + if (MinNumAGPRs == DefaultNumAGPR.first) { + // Default to splitting half the registers if AGPRs are required. + MinNumAGPRs = MaxNumAGPRs = MaxVectorRegs / 2; + } else { + // Align to accum_offset's allocation granularity. + MinNumAGPRs = alignTo(MinNumAGPRs, 4); + + MinNumAGPRs = std::min(MinNumAGPRs, TotalNumAGPRs); + } + + // Clamp values to be inbounds of our limits, and ensure min <= max. + + MaxNumAGPRs = std::min(std::max(MinNumAGPRs, MaxNumAGPRs), MaxVectorRegs); + MinNumAGPRs = std::min(std::min(MinNumAGPRs, TotalNumAGPRs), MaxNumAGPRs); + + MaxNumVGPRs = std::min(MaxVectorRegs - MinNumAGPRs, TotalNumVGPRs); + MaxNumAGPRs = std::min(MaxVectorRegs - MaxNumVGPRs, MaxNumAGPRs); + + assert(MaxNumVGPRs + MaxNumAGPRs <= MaxVectorRegs && + MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= TotalNumVGPRs && + "invalid register counts"); + } else if (hasMAIInsts()) { + // On gfx908 the number of AGPRs always equals the number of VGPRs. + MaxNumAGPRs = MaxNumVGPRs = MaxVectorRegs; + } + + return std::pair(MaxNumVGPRs, MaxNumAGPRs); +} + void GCNSubtarget::adjustSchedDependency( SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep, const TargetSchedModel *SchedModel) const { diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 88a269f..785ede3 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1667,6 +1667,10 @@ public: return getMaxNumVGPRs(F); } + /// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number + /// of waves per execution unit required for the function \p MF. + std::pair<unsigned, unsigned> getMaxNumVectorRegs(const Function &F) const; + /// \returns Maximum number of VGPRs that meets number of waves per execution /// unit requirement for function \p MF, or number of VGPRs explicitly /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF. diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 8d51ec6..9017f4f 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -15896,6 +15896,78 @@ SDValue SITargetLowering::performClampCombine(SDNode *N, return SDValue(CSrc, 0); } +SDValue SITargetLowering::performSelectCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + + // Try to fold CMP + SELECT patterns with shared constants (both FP and + // integer). + // Detect when CMP and SELECT use the same constant and fold them to avoid + // loading the constant twice. Specifically handles patterns like: + // %cmp = icmp eq i32 %val, 4242 + // %sel = select i1 %cmp, i32 4242, i32 %other + // It can be optimized to reuse %val instead of 4242 in select. + SDValue Cond = N->getOperand(0); + SDValue TrueVal = N->getOperand(1); + SDValue FalseVal = N->getOperand(2); + + // Check if condition is a comparison. + if (Cond.getOpcode() != ISD::SETCC) + return SDValue(); + + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + + bool isFloatingPoint = LHS.getValueType().isFloatingPoint(); + bool isInteger = LHS.getValueType().isInteger(); + + // Handle simple floating-point and integer types only. + if (!isFloatingPoint && !isInteger) + return SDValue(); + + bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ); + bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE); + if (!isEquality && !isNonEquality) + return SDValue(); + + SDValue ArgVal, ConstVal; + if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) || + (isInteger && isa<ConstantSDNode>(RHS))) { + ConstVal = RHS; + ArgVal = LHS; + } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) || + (isInteger && isa<ConstantSDNode>(LHS))) { + ConstVal = LHS; + ArgVal = RHS; + } else { + return SDValue(); + } + + // Skip optimization for inlinable immediates. + if (isFloatingPoint) { + const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF(); + if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val)) + return SDValue(); + } else { + if (AMDGPU::isInlinableIntLiteral( + cast<ConstantSDNode>(ConstVal)->getSExtValue())) + return SDValue(); + } + + // For equality and non-equality comparisons, patterns: + // select (setcc x, const), const, y -> select (setcc x, const), x, y + // select (setccinv x, const), y, const -> select (setccinv x, const), y, x + if (!(isEquality && TrueVal == ConstVal) && + !(isNonEquality && FalseVal == ConstVal)) + return SDValue(); + + SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal; + SDValue SelectRHS = + (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal; + return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond, + SelectLHS, SelectRHS); +} + SDValue SITargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { switch (N->getOpcode()) { @@ -15944,6 +16016,10 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return performFMulCombine(N, DCI); case ISD::SETCC: return performSetCCCombine(N, DCI); + case ISD::SELECT: + if (auto Res = performSelectCombine(N, DCI)) + return Res; + break; case ISD::FMAXNUM: case ISD::FMINNUM: case ISD::FMAXNUM_IEEE: diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index acf6158..dedd9ae 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -211,6 +211,7 @@ private: SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performInsertVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFPRoundCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue reassociateScalarOps(SDNode *N, SelectionDAG &DAG) const; unsigned getFusedOpcode(const SelectionDAG &DAG, diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index dd3f2fe..520c321 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -552,7 +552,7 @@ public: (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) { // FLAT and SCRATCH instructions may access scratch. Other VMEM // instructions do not. - if (SIInstrInfo::isFLAT(Inst) && mayAccessScratchThroughFlat(Inst)) + if (TII->mayAccessScratchThroughFlat(Inst)) return SCRATCH_WRITE_ACCESS; return VMEM_WRITE_ACCESS; } @@ -565,7 +565,6 @@ public: bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const; bool mayAccessLDSThroughFlat(const MachineInstr &MI) const; - bool mayAccessScratchThroughFlat(const MachineInstr &MI) const; bool isVmemAccess(const MachineInstr &MI) const; bool generateWaitcntInstBefore(MachineInstr &MI, WaitcntBrackets &ScoreBrackets, @@ -2160,32 +2159,6 @@ bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const { return false; } -// This is a flat memory operation. Check to see if it has memory tokens for -// either scratch or FLAT. -bool SIInsertWaitcnts::mayAccessScratchThroughFlat( - const MachineInstr &MI) const { - assert(TII->isFLAT(MI)); - - // SCRATCH instructions always access scratch. - if (TII->isFLATScratch(MI)) - return true; - - // GLOBAL instructions never access scratch. - if (TII->isFLATGlobal(MI)) - return false; - - // If there are no memory operands then conservatively assume the flat - // operation may access scratch. - if (MI.memoperands_empty()) - return true; - - // See if any memory operand specifies an address space that involves scratch. - return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) { - unsigned AS = Memop->getAddrSpace(); - return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS; - }); -} - bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const { return (TII->isFLAT(MI) && mayAccessVMEMThroughFlat(MI)) || (TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode())); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 8d6c1d0..2aa6b4e 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -4249,6 +4249,32 @@ bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const { Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode); } +bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const { + if (!isFLAT(MI) || isFLATGlobal(MI)) + return false; + + // If scratch is not initialized, we can never access it. + if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init")) + return false; + + // SCRATCH instructions always access scratch. + if (isFLATScratch(MI)) + return true; + + // If there are no memory operands then conservatively assume the flat + // operation may access scratch. + if (MI.memoperands_empty()) + return true; + + // TODO (?): Does this need to be taught how to read noalias.addrspace ? + + // See if any memory operand specifies an address space that involves scratch. + return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) { + unsigned AS = Memop->getAddrSpace(); + return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS; + }); +} + bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) { // Skip the full operand and register alias search modifiesRegister // does. There's only a handful of instructions that touch this, it's only an diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 2ffb783..e042b59 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -678,6 +678,12 @@ public: return get(Opcode).TSFlags & SIInstrFlags::FLAT; } + /// \returns true for SCRATCH_ instructions, or FLAT_ instructions with + /// SCRATCH_ memory operands. + /// Conservatively correct; will return true if \p MI cannot be proven + /// to not hit scratch. + bool mayAccessScratchThroughFlat(const MachineInstr &MI) const; + static bool isBlockLoadStore(uint16_t Opcode) { switch (Opcode) { case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE: diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index 9f61bf8..9509199 100644 --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -351,6 +351,7 @@ void SILowerSGPRSpills::determineRegsForWWMAllocation(MachineFunction &MF, MachineRegisterInfo &MRI = MF.getRegInfo(); BitVector ReservedRegs = TRI->getReservedRegs(MF); BitVector NonWwmAllocMask(TRI->getNumRegs()); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); // FIXME: MaxNumVGPRsForWwmAllocation might need to be adjusted in the future // to have a balanced allocation between WWM values and per-thread vector @@ -359,7 +360,7 @@ void SILowerSGPRSpills::determineRegsForWWMAllocation(MachineFunction &MF, NumRegs = std::min(static_cast<unsigned>(MFI->getSGPRSpillVGPRs().size()), NumRegs); - auto [MaxNumVGPRs, MaxNumAGPRs] = TRI->getMaxNumVectorRegs(MF); + auto [MaxNumVGPRs, MaxNumAGPRs] = ST.getMaxNumVectorRegs(MF.getFunction()); // Try to use the highest available registers for now. Later after // vgpr-regalloc, they can be shifted to the lowest range. unsigned I = 0; @@ -376,7 +377,7 @@ void SILowerSGPRSpills::determineRegsForWWMAllocation(MachineFunction &MF, // Reserve an arbitrary register and report the error. TRI->markSuperRegs(RegMask, AMDGPU::VGPR0); MF.getFunction().getContext().emitError( - "can't find enough VGPRs for wwm-regalloc"); + "cannot find enough VGPRs for wwm-regalloc"); } } diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 0e8a420..f1262e11 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -321,7 +321,7 @@ public: bool IsNonTemporal, bool IsLastUse = false) const = 0; - virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const { + virtual bool finalizeStore(MachineInstr &MI, bool Atomic) const { return false; }; @@ -602,7 +602,7 @@ public: bool IsVolatile, bool IsNonTemporal, bool IsLastUse) const override; - bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override; + bool finalizeStore(MachineInstr &MI, bool Atomic) const override; bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, @@ -2536,9 +2536,6 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( if (IsVolatile) { Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS); - if (Op == SIMemOp::STORE) - Changed |= insertWaitsBeforeSystemScopeStore(MI); - // Ensure operation has completed at system scope to cause all volatile // operations to be visible outside the program in a global order. Do not // request cross address space as only the global address space can be @@ -2551,11 +2548,24 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( return Changed; } -bool SIGfx12CacheControl::expandSystemScopeStore( - MachineBasicBlock::iterator &MI) const { - MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); - if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS)) - return insertWaitsBeforeSystemScopeStore(MI); +bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const { + MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol); + if (!CPol) + return false; + + const unsigned Scope = CPol->getImm() & CPol::SCOPE; + + // GFX12.0 only: Extra waits needed before system scope stores. + if (!ST.hasGFX1250Insts()) { + if (!Atomic && Scope == CPol::SCOPE_SYS) + return insertWaitsBeforeSystemScopeStore(MI); + return false; + } + + // GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address + // space. + if (TII->mayAccessScratchThroughFlat(MI) && Scope == CPol::SCOPE_CU) + return setScope(MI, CPol::SCOPE_SE); return false; } @@ -2658,6 +2668,8 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, assert(!MI->mayLoad() && MI->mayStore()); bool Changed = false; + // FIXME: Necessary hack because iterator can lose track of the store. + MachineInstr &StoreMI = *MI; if (MOI.isAtomic()) { if (MOI.getOrdering() == AtomicOrdering::Monotonic || @@ -2674,6 +2686,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, MOI.getIsCrossAddressSpaceOrdering(), Position::BEFORE); + Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/true); return Changed; } @@ -2686,7 +2699,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, // GFX12 specific, scope(desired coherence domain in cache hierarchy) is // instruction field, do not confuse it with atomic scope. - Changed |= CC->expandSystemScopeStore(MI); + Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/false); return Changed; } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 84cfa87..f3acc5c 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -572,65 +572,6 @@ MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg( return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass); } -std::pair<unsigned, unsigned> -SIRegisterInfo::getMaxNumVectorRegs(const MachineFunction &MF) const { - const unsigned MaxVectorRegs = ST.getMaxNumVGPRs(MF); - - unsigned MaxNumVGPRs = MaxVectorRegs; - unsigned MaxNumAGPRs = 0; - - // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically, - // a wave may have up to 512 total vector registers combining together both - // VGPRs and AGPRs. Hence, in an entry function without calls and without - // AGPRs used within it, it is possible to use the whole vector register - // budget for VGPRs. - // - // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split - // register file accordingly. - if (ST.hasGFX90AInsts()) { - unsigned MinNumAGPRs = 0; - const unsigned TotalNumAGPRs = AMDGPU::AGPR_32RegClass.getNumRegs(); - const unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); - - const std::pair<unsigned, unsigned> DefaultNumAGPR = {~0u, ~0u}; - - // TODO: Move this logic into subtarget on IR function - // - // TODO: The lower bound should probably force the number of required - // registers up, overriding amdgpu-waves-per-eu. - std::tie(MinNumAGPRs, MaxNumAGPRs) = AMDGPU::getIntegerPairAttribute( - MF.getFunction(), "amdgpu-agpr-alloc", DefaultNumAGPR, - /*OnlyFirstRequired=*/true); - - if (MinNumAGPRs == DefaultNumAGPR.first) { - // Default to splitting half the registers if AGPRs are required. - MinNumAGPRs = MaxNumAGPRs = MaxVectorRegs / 2; - } else { - // Align to accum_offset's allocation granularity. - MinNumAGPRs = alignTo(MinNumAGPRs, 4); - - MinNumAGPRs = std::min(MinNumAGPRs, TotalNumAGPRs); - } - - // Clamp values to be inbounds of our limits, and ensure min <= max. - - MaxNumAGPRs = std::min(std::max(MinNumAGPRs, MaxNumAGPRs), MaxVectorRegs); - MinNumAGPRs = std::min(std::min(MinNumAGPRs, TotalNumAGPRs), MaxNumAGPRs); - - MaxNumVGPRs = std::min(MaxVectorRegs - MinNumAGPRs, TotalNumVGPRs); - MaxNumAGPRs = std::min(MaxVectorRegs - MaxNumVGPRs, MaxNumAGPRs); - - assert(MaxNumVGPRs + MaxNumAGPRs <= MaxVectorRegs && - MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= TotalNumVGPRs && - "invalid register counts"); - } else if (ST.hasMAIInsts()) { - // On gfx908 the number of AGPRs always equals the number of VGPRs. - MaxNumAGPRs = MaxNumVGPRs = MaxVectorRegs; - } - - return std::pair(MaxNumVGPRs, MaxNumAGPRs); -} - BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); Reserved.set(AMDGPU::MODE); @@ -742,7 +683,7 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { // Reserve VGPRs/AGPRs. // - auto [MaxNumVGPRs, MaxNumAGPRs] = getMaxNumVectorRegs(MF); + auto [MaxNumVGPRs, MaxNumAGPRs] = ST.getMaxNumVectorRegs(MF.getFunction()); for (const TargetRegisterClass *RC : regclasses()) { if (RC->isBaseClass() && isVGPRClass(RC)) { diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 0008e5f..5508f07 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -90,11 +90,6 @@ public: /// spilling is needed. MCRegister reservedPrivateSegmentBufferReg(const MachineFunction &MF) const; - /// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number - /// of waves per execution unit required for the function \p MF. - std::pair<unsigned, unsigned> - getMaxNumVectorRegs(const MachineFunction &MF) const; - BitVector getReservedRegs(const MachineFunction &MF) const override; bool isAsmClobberable(const MachineFunction &MF, MCRegister PhysReg) const override; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index b5b3cc9..83e63ac 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -732,7 +732,14 @@ bool isGenericAtomic(unsigned Opc) { } bool isAsyncStore(unsigned Opc) { - return false; // placeholder before async store implementation. + return Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B8_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B32_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B64_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B128_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B8_SADDR_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B32_SADDR_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B64_SADDR_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B128_SADDR_gfx1250; } bool isTensorStore(unsigned Opc) { diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index 50217c3..9e4dbec 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -4261,8 +4261,7 @@ std::optional<unsigned> ARMBaseInstrInfo::getOperandLatencyImpl( // instructions). if (Latency > 0 && Subtarget.isThumb2()) { const MachineFunction *MF = DefMI.getParent()->getParent(); - // FIXME: Use Function::hasOptSize(). - if (MF->getFunction().hasFnAttribute(Attribute::OptimizeForSize)) + if (MF->getFunction().hasOptSize()) --Latency; } return Latency; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index fca5dff..066b392 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -370,6 +370,11 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::FMINNUM, VT, Legal); setOperationAction(ISD::FMAXNUM, VT, Legal); setOperationAction(ISD::FROUND, VT, Legal); + setOperationAction(ISD::FROUNDEVEN, VT, Legal); + setOperationAction(ISD::FRINT, VT, Legal); + setOperationAction(ISD::FTRUNC, VT, Legal); + setOperationAction(ISD::FFLOOR, VT, Legal); + setOperationAction(ISD::FCEIL, VT, Legal); setOperationAction(ISD::VECREDUCE_FADD, VT, Custom); setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom); setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); @@ -1507,6 +1512,12 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, setOperationAction(ISD::FLOG2, MVT::f16, Promote); setOperationAction(ISD::FROUND, MVT::f16, Legal); + setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal); + setOperationAction(ISD::FTRUNC, MVT::f16, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::f16, Legal); + setOperationAction(ISD::FRINT, MVT::f16, Legal); + setOperationAction(ISD::FFLOOR, MVT::f16, Legal); + setOperationAction(ISD::FCEIL, MVT::f16, Legal); } if (Subtarget->hasNEON()) { diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp index c2c1bb4..0615ec9 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp @@ -24,8 +24,6 @@ AVRMCAsmInfo::AVRMCAsmInfo(const Triple &TT, const MCTargetOptions &Options) { CalleeSaveStackSlotSize = 2; CommentString = ";"; SeparatorString = "$"; - PrivateGlobalPrefix = ".L"; - PrivateLabelPrefix = ".L"; UsesELFSectionDirectiveForBSS = true; SupportsDebugInformation = true; } diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h index fab2713..1915fa8 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h @@ -14,7 +14,7 @@ #define LLVM_AVR_ASM_INFO_H #include "MCTargetDesc/AVRMCExpr.h" -#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCAsmInfoELF.h" #include "llvm/MC/MCExpr.h" namespace llvm { @@ -22,7 +22,7 @@ namespace llvm { class Triple; /// Specifies the format of AVR assembly files. -class AVRMCAsmInfo : public MCAsmInfo { +class AVRMCAsmInfo : public MCAsmInfoELF { public: explicit AVRMCAsmInfo(const Triple &TT, const MCTargetOptions &Options); void printSpecifierExpr(raw_ostream &OS, diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp index a87b9a2..bed6bc9 100644 --- a/llvm/lib/Target/BPF/BTFDebug.cpp +++ b/llvm/lib/Target/BPF/BTFDebug.cpp @@ -957,47 +957,47 @@ void BTFDebug::visitMapDefType(const DIType *Ty, uint32_t &TypeId) { return; } - // MapDef type may be a struct type or a non-pointer derived type - const DIType *OrigTy = Ty; - while (auto *DTy = dyn_cast<DIDerivedType>(Ty)) { - auto Tag = DTy->getTag(); - if (Tag != dwarf::DW_TAG_typedef && Tag != dwarf::DW_TAG_const_type && - Tag != dwarf::DW_TAG_volatile_type && - Tag != dwarf::DW_TAG_restrict_type) - break; - Ty = DTy->getBaseType(); - } - - const auto *CTy = dyn_cast<DICompositeType>(Ty); - if (!CTy) - return; - - auto Tag = CTy->getTag(); - if (Tag != dwarf::DW_TAG_structure_type || CTy->isForwardDecl()) - return; - - // Visit all struct members to ensure their types are visited. - const DINodeArray Elements = CTy->getElements(); - for (const auto *Element : Elements) { - const auto *MemberType = cast<DIDerivedType>(Element); - const DIType *MemberBaseType = MemberType->getBaseType(); - - // If the member is a composite type, that may indicate the currently - // visited composite type is a wrapper, and the member represents the - // actual map definition. - // In that case, visit the member with `visitMapDefType` instead of - // `visitTypeEntry`, treating it specifically as a map definition rather - // than as a regular composite type. - const auto *MemberCTy = dyn_cast<DICompositeType>(MemberBaseType); - if (MemberCTy) { - visitMapDefType(MemberBaseType, TypeId); - } else { - visitTypeEntry(MemberBaseType); + uint32_t TmpId; + switch (Ty->getTag()) { + case dwarf::DW_TAG_typedef: + case dwarf::DW_TAG_const_type: + case dwarf::DW_TAG_volatile_type: + case dwarf::DW_TAG_restrict_type: + case dwarf::DW_TAG_pointer_type: + visitMapDefType(dyn_cast<DIDerivedType>(Ty)->getBaseType(), TmpId); + break; + case dwarf::DW_TAG_array_type: + // Visit nested map array and jump to the element type + visitMapDefType(dyn_cast<DICompositeType>(Ty)->getBaseType(), TmpId); + break; + case dwarf::DW_TAG_structure_type: { + // Visit all struct members to ensure their types are visited. + const auto *CTy = cast<DICompositeType>(Ty); + const DINodeArray Elements = CTy->getElements(); + for (const auto *Element : Elements) { + const auto *MemberType = cast<DIDerivedType>(Element); + const DIType *MemberBaseType = MemberType->getBaseType(); + // If the member is a composite type, that may indicate the currently + // visited composite type is a wrapper, and the member represents the + // actual map definition. + // In that case, visit the member with `visitMapDefType` instead of + // `visitTypeEntry`, treating it specifically as a map definition rather + // than as a regular composite type. + const auto *MemberCTy = dyn_cast<DICompositeType>(MemberBaseType); + if (MemberCTy) { + visitMapDefType(MemberBaseType, TmpId); + } else { + visitTypeEntry(MemberBaseType); + } } + break; + } + default: + break; } // Visit this type, struct or a const/typedef/volatile/restrict type - visitTypeEntry(OrigTy, TypeId, false, false); + visitTypeEntry(Ty, TypeId, false, false); } /// Read file contents from the actual file or from the source diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h index 7b21684..63d6e6f 100644 --- a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h +++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h @@ -13,18 +13,19 @@ #ifndef LLVM_LIB_TARGET_BPF_MCTARGETDESC_BPFMCASMINFO_H #define LLVM_LIB_TARGET_BPF_MCTARGETDESC_BPFMCASMINFO_H -#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCAsmInfoELF.h" #include "llvm/TargetParser/Triple.h" namespace llvm { -class BPFMCAsmInfo : public MCAsmInfo { +class BPFMCAsmInfo : public MCAsmInfoELF { public: explicit BPFMCAsmInfo(const Triple &TT, const MCTargetOptions &Options) { if (TT.getArch() == Triple::bpfeb) IsLittleEndian = false; PrivateGlobalPrefix = ".L"; + PrivateLabelPrefix = "L"; WeakRefDirective = "\t.weak\t"; UsesELFSectionDirectiveForBSS = true; diff --git a/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp index c86fa2b..54c3cea 100644 --- a/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp @@ -457,7 +457,7 @@ bool HexagonCopyToCombine::runOnMachineFunction(MachineFunction &MF) { TII = ST->getInstrInfo(); const Function &F = MF.getFunction(); - bool OptForSize = F.hasFnAttribute(Attribute::OptimizeForSize); + bool OptForSize = F.hasOptSize(); // Combine aggressively (for code size) ShouldCombineAggressively = diff --git a/llvm/lib/Target/Hexagon/HexagonMask.cpp b/llvm/lib/Target/Hexagon/HexagonMask.cpp index 6eccf80..9d7776d 100644 --- a/llvm/lib/Target/Hexagon/HexagonMask.cpp +++ b/llvm/lib/Target/Hexagon/HexagonMask.cpp @@ -76,7 +76,7 @@ bool HexagonMask::runOnMachineFunction(MachineFunction &MF) { HII = HST.getInstrInfo(); const Function &F = MF.getFunction(); - if (!F.hasFnAttribute(Attribute::OptimizeForSize)) + if (!F.hasOptSize()) return false; // Mask instruction is available only from v66 if (!HST.hasV66Ops()) diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 613cfb5..d96136c 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -2385,13 +2385,6 @@ SDValue LoongArchTargetLowering::lowerBF16_TO_FP(SDValue Op, return Res; } -static bool isConstantBUILD_VECTOR(const BuildVectorSDNode *Op) { - for (unsigned i = 0; i < Op->getNumOperands(); ++i) - if (isIntOrFPConstant(Op->getOperand(i))) - return true; - return false; -} - // Lower BUILD_VECTOR as broadcast load (if possible). // For example: // %a = load i8, ptr %ptr @@ -2441,10 +2434,14 @@ SDValue LoongArchTargetLowering::lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { BuildVectorSDNode *Node = cast<BuildVectorSDNode>(Op); EVT ResTy = Op->getValueType(0); + unsigned NumElts = ResTy.getVectorNumElements(); SDLoc DL(Op); APInt SplatValue, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; + bool IsConstant = false; + bool UseSameConstant = true; + SDValue ConstantValue; bool Is128Vec = ResTy.is128BitVector(); bool Is256Vec = ResTy.is256BitVector(); @@ -2495,13 +2492,35 @@ SDValue LoongArchTargetLowering::lowerBUILD_VECTOR(SDValue Op, if (DAG.isSplatValue(Op, /*AllowUndefs=*/false)) return Op; - if (!isConstantBUILD_VECTOR(Node)) { + for (unsigned i = 0; i < NumElts; ++i) { + SDValue Opi = Node->getOperand(i); + if (isIntOrFPConstant(Opi)) { + IsConstant = true; + if (!ConstantValue.getNode()) + ConstantValue = Opi; + else if (ConstantValue != Opi) + UseSameConstant = false; + } + } + + // If the type of BUILD_VECTOR is v2f64, custom legalizing it has no benefits. + if (IsConstant && UseSameConstant && ResTy != MVT::v2f64) { + SDValue Result = DAG.getSplatBuildVector(ResTy, DL, ConstantValue); + for (unsigned i = 0; i < NumElts; ++i) { + SDValue Opi = Node->getOperand(i); + if (!isIntOrFPConstant(Opi)) + Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ResTy, Result, Opi, + DAG.getConstant(i, DL, Subtarget.getGRLenVT())); + } + return Result; + } + + if (!IsConstant) { // Use INSERT_VECTOR_ELT operations rather than expand to stores. // The resulting code is the same length as the expansion, but it doesn't // use memory operations. assert(ResTy.isVector()); - unsigned NumElts = ResTy.getVectorNumElements(); SDValue Op0 = Node->getOperand(0); SDValue Vector = DAG.getUNDEF(ResTy); diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp index 8fa72bc..d9ea88c 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp @@ -254,6 +254,7 @@ bool LoongArchAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) { MCFixup Fixup = MCFixup::create(0, Expr, FirstLiteralRelocationKind + ELF::R_LARCH_ALIGN); F.setVarFixups({Fixup}); + F.setLinkerRelaxable(); F.getParent()->setLinkerRelaxable(); return true; } diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp index ca03310..a2e48ab 100644 --- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp @@ -737,14 +737,18 @@ void MipsAsmPrinter::emitStartOfAsmFile(Module &M) { if (FS.empty() && M.size() && F->hasFnAttribute("target-features")) FS = F->getFnAttribute("target-features").getValueAsString(); + std::string strFS = FS.str(); + if (M.size() && F->getFnAttribute("use-soft-float").getValueAsBool()) + strFS += strFS.empty() ? "+soft-float" : ",+soft-float"; + // Compute MIPS architecture attributes based on the default subtarget // that we'd have constructed. // FIXME: For ifunc related functions we could iterate over and look // for a feature string that doesn't match the default one. StringRef CPU = MIPS_MC::selectMipsCPU(TT, TM.getTargetCPU()); const MipsTargetMachine &MTM = static_cast<const MipsTargetMachine &>(TM); - const MipsSubtarget STI(TT, CPU, FS, MTM.isLittleEndian(), MTM, - std::nullopt); + const MipsSubtarget STI(TT, CPU, StringRef(strFS), MTM.isLittleEndian(), + MTM, std::nullopt); bool IsABICalls = STI.isABICalls(); const MipsABIInfo &ABI = MTM.getABI(); diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp index 0e581a7..ec6b382 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.cpp +++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -522,9 +522,6 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setOperationAction(ISD::TRAP, MVT::Other, Legal); - setOperationAction(ISD::ConstantFP, MVT::f32, Custom); - setOperationAction(ISD::ConstantFP, MVT::f64, Custom); - setTargetDAGCombine({ISD::SDIVREM, ISD::UDIVREM, ISD::SELECT, ISD::AND, ISD::OR, ISD::ADD, ISD::SUB, ISD::AssertZext, ISD::SHL, ISD::SIGN_EXTEND}); @@ -1360,8 +1357,6 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const case ISD::FP_TO_SINT: return lowerFP_TO_SINT(Op, DAG); case ISD::READCYCLECOUNTER: return lowerREADCYCLECOUNTER(Op, DAG); - case ISD::ConstantFP: - return lowerConstantFP(Op, DAG); } return SDValue(); } @@ -3019,30 +3014,6 @@ SDValue MipsTargetLowering::lowerFP_TO_SINT(SDValue Op, return DAG.getNode(ISD::BITCAST, SDLoc(Op), Op.getValueType(), Trunc); } -SDValue MipsTargetLowering::lowerConstantFP(SDValue Op, - SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT VT = Op.getSimpleValueType(); - SDNode *N = Op.getNode(); - ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(N); - - if (!CFP->isNaN() || Subtarget.isNaN2008()) { - return SDValue(); - } - - APFloat NaNValue = CFP->getValueAPF(); - auto &Sem = NaNValue.getSemantics(); - - // The MSB of the mantissa should be zero for QNaNs in the MIPS legacy NaN - // encodings, and one for sNaNs. Check every NaN constants and make sure - // they are correctly encoded for legacy encodings. - if (!NaNValue.isSignaling()) { - APFloat RealQNaN = NaNValue.getSNaN(Sem); - return DAG.getConstantFP(RealQNaN, DL, VT); - } - return SDValue(); -} - //===----------------------------------------------------------------------===// // Calling Convention Implementation //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/Mips/MipsISelLowering.h b/llvm/lib/Target/Mips/MipsISelLowering.h index 31ac5d4..c65c76c 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.h +++ b/llvm/lib/Target/Mips/MipsISelLowering.h @@ -592,7 +592,6 @@ class TargetRegisterClass; SDValue lowerEH_DWARF_CFA(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerREADCYCLECOUNTER(SDValue Op, SelectionDAG &DAG) const; - SDValue lowerConstantFP(SDValue Op, SelectionDAG &DAG) const; /// isEligibleForTailCallOptimization - Check whether the call is eligible /// for tail call optimization. diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp index 614b321..ce9cd12 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp @@ -15,8 +15,6 @@ using namespace llvm; -void NVPTXMCAsmInfo::anchor() {} - NVPTXMCAsmInfo::NVPTXMCAsmInfo(const Triple &TheTriple, const MCTargetOptions &Options) { if (TheTriple.getArch() == Triple::nvptx64) { diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h index 77c4dae..f071406 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h @@ -19,8 +19,6 @@ namespace llvm { class Triple; class NVPTXMCAsmInfo : public MCAsmInfo { - virtual void anchor(); - public: explicit NVPTXMCAsmInfo(const Triple &TheTriple, const MCTargetOptions &Options); diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp index 9f91143..329e3b5 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp @@ -97,10 +97,7 @@ void NVPTXTargetStreamer::changeSection(const MCSection *CurSection, if (isDwarfSection(FI, Section)) { // Emit DWARF .file directives in the outermost scope. outputDwarfFileDirectives(); - OS << "\t.section"; - Section->printSwitchToSection(*getStreamer().getContext().getAsmInfo(), - getStreamer().getContext().getTargetTriple(), - OS, SubSection); + OS << "\t.section\t" << Section->getName() << '\n'; // DWARF sections are enclosed into braces - emit the open one. OS << "\t{\n"; HasSections = true; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 65e7c56..96f52275 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -145,18 +145,6 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) { if (tryStoreVector(N)) return; break; - case NVPTXISD::LoadParam: - case NVPTXISD::LoadParamV2: - case NVPTXISD::LoadParamV4: - if (tryLoadParam(N)) - return; - break; - case NVPTXISD::StoreParam: - case NVPTXISD::StoreParamV2: - case NVPTXISD::StoreParamV4: - if (tryStoreParam(N)) - return; - break; case ISD::INTRINSIC_W_CHAIN: if (tryIntrinsicChain(N)) return; @@ -1462,267 +1450,6 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { return true; } -bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) { - SDValue Chain = Node->getOperand(0); - SDValue Offset = Node->getOperand(2); - SDValue Glue = Node->getOperand(3); - SDLoc DL(Node); - MemSDNode *Mem = cast<MemSDNode>(Node); - - unsigned VecSize; - switch (Node->getOpcode()) { - default: - return false; - case NVPTXISD::LoadParam: - VecSize = 1; - break; - case NVPTXISD::LoadParamV2: - VecSize = 2; - break; - case NVPTXISD::LoadParamV4: - VecSize = 4; - break; - } - - EVT EltVT = Node->getValueType(0); - EVT MemVT = Mem->getMemoryVT(); - - std::optional<unsigned> Opcode; - - switch (VecSize) { - default: - return false; - case 1: - Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, - NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16, - NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64); - break; - case 2: - Opcode = - pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8, - NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32, - NVPTX::LoadParamMemV2I64); - break; - case 4: - Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, - NVPTX::LoadParamMemV4I8, NVPTX::LoadParamMemV4I16, - NVPTX::LoadParamMemV4I32, {/* no v4i64 */}); - break; - } - if (!Opcode) - return false; - - SDVTList VTs; - if (VecSize == 1) { - VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue); - } else if (VecSize == 2) { - VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue); - } else { - EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue }; - VTs = CurDAG->getVTList(EVTs); - } - - unsigned OffsetVal = Offset->getAsZExtVal(); - - SmallVector<SDValue, 2> Ops( - {CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain, Glue}); - - ReplaceNode(Node, CurDAG->getMachineNode(*Opcode, DL, VTs, Ops)); - return true; -} - -// Helpers for constructing opcode (ex: NVPTX::StoreParamV4F32_iiri) -#define getOpcV2H(ty, opKind0, opKind1) \ - NVPTX::StoreParamV2##ty##_##opKind0##opKind1 - -#define getOpcV2H1(ty, opKind0, isImm1) \ - (isImm1) ? getOpcV2H(ty, opKind0, i) : getOpcV2H(ty, opKind0, r) - -#define getOpcodeForVectorStParamV2(ty, isimm) \ - (isimm[0]) ? getOpcV2H1(ty, i, isimm[1]) : getOpcV2H1(ty, r, isimm[1]) - -#define getOpcV4H(ty, opKind0, opKind1, opKind2, opKind3) \ - NVPTX::StoreParamV4##ty##_##opKind0##opKind1##opKind2##opKind3 - -#define getOpcV4H3(ty, opKind0, opKind1, opKind2, isImm3) \ - (isImm3) ? getOpcV4H(ty, opKind0, opKind1, opKind2, i) \ - : getOpcV4H(ty, opKind0, opKind1, opKind2, r) - -#define getOpcV4H2(ty, opKind0, opKind1, isImm2, isImm3) \ - (isImm2) ? getOpcV4H3(ty, opKind0, opKind1, i, isImm3) \ - : getOpcV4H3(ty, opKind0, opKind1, r, isImm3) - -#define getOpcV4H1(ty, opKind0, isImm1, isImm2, isImm3) \ - (isImm1) ? getOpcV4H2(ty, opKind0, i, isImm2, isImm3) \ - : getOpcV4H2(ty, opKind0, r, isImm2, isImm3) - -#define getOpcodeForVectorStParamV4(ty, isimm) \ - (isimm[0]) ? getOpcV4H1(ty, i, isimm[1], isimm[2], isimm[3]) \ - : getOpcV4H1(ty, r, isimm[1], isimm[2], isimm[3]) - -#define getOpcodeForVectorStParam(n, ty, isimm) \ - (n == 2) ? getOpcodeForVectorStParamV2(ty, isimm) \ - : getOpcodeForVectorStParamV4(ty, isimm) - -static unsigned pickOpcodeForVectorStParam(SmallVector<SDValue, 8> &Ops, - unsigned NumElts, - MVT::SimpleValueType MemTy, - SelectionDAG *CurDAG, SDLoc DL) { - // Determine which inputs are registers and immediates make new operators - // with constant values - SmallVector<bool, 4> IsImm(NumElts, false); - for (unsigned i = 0; i < NumElts; i++) { - IsImm[i] = (isa<ConstantSDNode>(Ops[i]) || isa<ConstantFPSDNode>(Ops[i])); - if (IsImm[i]) { - SDValue Imm = Ops[i]; - if (MemTy == MVT::f32 || MemTy == MVT::f64) { - const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Imm); - const ConstantFP *CF = ConstImm->getConstantFPValue(); - Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0)); - } else { - const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Imm); - const ConstantInt *CI = ConstImm->getConstantIntValue(); - Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0)); - } - Ops[i] = Imm; - } - } - - // Get opcode for MemTy, size, and register/immediate operand ordering - switch (MemTy) { - case MVT::i8: - return getOpcodeForVectorStParam(NumElts, I8, IsImm); - case MVT::i16: - return getOpcodeForVectorStParam(NumElts, I16, IsImm); - case MVT::i32: - return getOpcodeForVectorStParam(NumElts, I32, IsImm); - case MVT::i64: - assert(NumElts == 2 && "MVT too large for NumElts > 2"); - return getOpcodeForVectorStParamV2(I64, IsImm); - case MVT::f32: - return getOpcodeForVectorStParam(NumElts, F32, IsImm); - case MVT::f64: - assert(NumElts == 2 && "MVT too large for NumElts > 2"); - return getOpcodeForVectorStParamV2(F64, IsImm); - - // These cases don't support immediates, just use the all register version - // and generate moves. - case MVT::i1: - return (NumElts == 2) ? NVPTX::StoreParamV2I8_rr - : NVPTX::StoreParamV4I8_rrrr; - case MVT::f16: - case MVT::bf16: - return (NumElts == 2) ? NVPTX::StoreParamV2I16_rr - : NVPTX::StoreParamV4I16_rrrr; - case MVT::v2f16: - case MVT::v2bf16: - case MVT::v2i16: - case MVT::v4i8: - return (NumElts == 2) ? NVPTX::StoreParamV2I32_rr - : NVPTX::StoreParamV4I32_rrrr; - default: - llvm_unreachable("Cannot select st.param for unknown MemTy"); - } -} - -bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) { - SDLoc DL(N); - SDValue Chain = N->getOperand(0); - SDValue Param = N->getOperand(1); - unsigned ParamVal = Param->getAsZExtVal(); - SDValue Offset = N->getOperand(2); - unsigned OffsetVal = Offset->getAsZExtVal(); - MemSDNode *Mem = cast<MemSDNode>(N); - SDValue Glue = N->getOperand(N->getNumOperands() - 1); - - // How many elements do we have? - unsigned NumElts; - switch (N->getOpcode()) { - default: - llvm_unreachable("Unexpected opcode"); - case NVPTXISD::StoreParam: - NumElts = 1; - break; - case NVPTXISD::StoreParamV2: - NumElts = 2; - break; - case NVPTXISD::StoreParamV4: - NumElts = 4; - break; - } - - // Build vector of operands - SmallVector<SDValue, 8> Ops; - for (unsigned i = 0; i < NumElts; ++i) - Ops.push_back(N->getOperand(i + 3)); - Ops.append({CurDAG->getTargetConstant(ParamVal, DL, MVT::i32), - CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain, Glue}); - - // Determine target opcode - // If we have an i1, use an 8-bit store. The lowering code in - // NVPTXISelLowering will have already emitted an upcast. - std::optional<unsigned> Opcode; - switch (NumElts) { - default: - llvm_unreachable("Unexpected NumElts"); - case 1: { - MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy; - SDValue Imm = Ops[0]; - if (MemTy != MVT::f16 && MemTy != MVT::bf16 && - (isa<ConstantSDNode>(Imm) || isa<ConstantFPSDNode>(Imm))) { - // Convert immediate to target constant - if (MemTy == MVT::f32 || MemTy == MVT::f64) { - const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Imm); - const ConstantFP *CF = ConstImm->getConstantFPValue(); - Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0)); - } else { - const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Imm); - const ConstantInt *CI = ConstImm->getConstantIntValue(); - Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0)); - } - Ops[0] = Imm; - // Use immediate version of store param - Opcode = - pickOpcodeForVT(MemTy, NVPTX::StoreParamI8_i, NVPTX::StoreParamI16_i, - NVPTX::StoreParamI32_i, NVPTX::StoreParamI64_i); - } else - Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy, - NVPTX::StoreParamI8_r, NVPTX::StoreParamI16_r, - NVPTX::StoreParamI32_r, NVPTX::StoreParamI64_r); - if (Opcode == NVPTX::StoreParamI8_r) { - // Fine tune the opcode depending on the size of the operand. - // This helps to avoid creating redundant COPY instructions in - // InstrEmitter::AddRegisterOperand(). - switch (Ops[0].getSimpleValueType().SimpleTy) { - default: - break; - case MVT::i32: - Opcode = NVPTX::StoreParamI8TruncI32_r; - break; - case MVT::i64: - Opcode = NVPTX::StoreParamI8TruncI64_r; - break; - } - } - break; - } - case 2: - case 4: { - MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy; - Opcode = pickOpcodeForVectorStParam(Ops, NumElts, MemTy, CurDAG, DL); - break; - } - } - - SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue); - SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, RetVTs, Ops); - MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand(); - CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef}); - - ReplaceNode(N, Ret); - return true; -} - /// SelectBFE - Look for instruction sequences that can be made more efficient /// by using the 'bfe' (bit-field extract) PTX instruction bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index b99b4ef..e504a8f 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -78,8 +78,6 @@ private: bool tryLDG(MemSDNode *N); bool tryStore(SDNode *N); bool tryStoreVector(SDNode *N); - bool tryLoadParam(SDNode *N); - bool tryStoreParam(SDNode *N); bool tryFence(SDNode *N); void SelectAddrSpaceCast(SDNode *N); bool tryBFE(SDNode *N); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index ddcecc00..f79b862 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -1075,12 +1075,6 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(NVPTXISD::DeclareArrayParam) MAKE_CASE(NVPTXISD::DeclareScalarParam) MAKE_CASE(NVPTXISD::CALL) - MAKE_CASE(NVPTXISD::LoadParam) - MAKE_CASE(NVPTXISD::LoadParamV2) - MAKE_CASE(NVPTXISD::LoadParamV4) - MAKE_CASE(NVPTXISD::StoreParam) - MAKE_CASE(NVPTXISD::StoreParamV2) - MAKE_CASE(NVPTXISD::StoreParamV4) MAKE_CASE(NVPTXISD::MoveParam) MAKE_CASE(NVPTXISD::UNPACK_VECTOR) MAKE_CASE(NVPTXISD::BUILD_VECTOR) @@ -1318,105 +1312,6 @@ Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty, return DL.getABITypeAlign(Ty); } -static bool adjustElementType(EVT &ElementType) { - switch (ElementType.getSimpleVT().SimpleTy) { - default: - return false; - case MVT::f16: - case MVT::bf16: - ElementType = MVT::i16; - return true; - case MVT::f32: - case MVT::v2f16: - case MVT::v2bf16: - ElementType = MVT::i32; - return true; - case MVT::f64: - ElementType = MVT::i64; - return true; - } -} - -// Use byte-store when the param address of the argument value is unaligned. -// This may happen when the return value is a field of a packed structure. -// -// This is called in LowerCall() when passing the param values. -static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain, - uint64_t Offset, EVT ElementType, - SDValue StVal, SDValue &InGlue, - unsigned ArgID, const SDLoc &dl) { - // Bit logic only works on integer types - if (adjustElementType(ElementType)) - StVal = DAG.getNode(ISD::BITCAST, dl, ElementType, StVal); - - // Store each byte - SDVTList StoreVTs = DAG.getVTList(MVT::Other, MVT::Glue); - for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) { - // Shift the byte to the last byte position - SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, StVal, - DAG.getConstant(i * 8, dl, MVT::i32)); - SDValue StoreOperands[] = {Chain, DAG.getConstant(ArgID, dl, MVT::i32), - DAG.getConstant(Offset + i, dl, MVT::i32), - ShiftVal, InGlue}; - // Trunc store only the last byte by using - // st.param.b8 - // The register type can be larger than b8. - Chain = DAG.getMemIntrinsicNode( - NVPTXISD::StoreParam, dl, StoreVTs, StoreOperands, MVT::i8, - MachinePointerInfo(), Align(1), MachineMemOperand::MOStore); - InGlue = Chain.getValue(1); - } - return Chain; -} - -// Use byte-load when the param adress of the returned value is unaligned. -// This may happen when the returned value is a field of a packed structure. -static SDValue -LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset, - EVT ElementType, SDValue &InGlue, - SmallVectorImpl<SDValue> &TempProxyRegOps, - const SDLoc &dl) { - // Bit logic only works on integer types - EVT MergedType = ElementType; - adjustElementType(MergedType); - - // Load each byte and construct the whole value. Initial value to 0 - SDValue RetVal = DAG.getConstant(0, dl, MergedType); - // LoadParamMemI8 loads into i16 register only - SDVTList LoadVTs = DAG.getVTList(MVT::i16, MVT::Other, MVT::Glue); - for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) { - SDValue LoadOperands[] = {Chain, DAG.getConstant(1, dl, MVT::i32), - DAG.getConstant(Offset + i, dl, MVT::i32), - InGlue}; - // This will be selected to LoadParamMemI8 - SDValue LdVal = - DAG.getMemIntrinsicNode(NVPTXISD::LoadParam, dl, LoadVTs, LoadOperands, - MVT::i8, MachinePointerInfo(), Align(1)); - SDValue TmpLdVal = LdVal.getValue(0); - Chain = LdVal.getValue(1); - InGlue = LdVal.getValue(2); - - TmpLdVal = DAG.getNode(NVPTXISD::ProxyReg, dl, - TmpLdVal.getSimpleValueType(), TmpLdVal); - TempProxyRegOps.push_back(TmpLdVal); - - SDValue CMask = DAG.getConstant(255, dl, MergedType); - SDValue CShift = DAG.getConstant(i * 8, dl, MVT::i32); - // Need to extend the i16 register to the whole width. - TmpLdVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MergedType, TmpLdVal); - // Mask off the high bits. Leave only the lower 8bits. - // Do this because we are using loadparam.b8. - TmpLdVal = DAG.getNode(ISD::AND, dl, MergedType, TmpLdVal, CMask); - // Shift and merge - TmpLdVal = DAG.getNode(ISD::SHL, dl, MergedType, TmpLdVal, CShift); - RetVal = DAG.getNode(ISD::OR, dl, MergedType, RetVal, TmpLdVal); - } - if (ElementType != MergedType) - RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal); - - return RetVal; -} - static bool shouldConvertToIndirectCall(const CallBase *CB, const GlobalAddressSDNode *Func) { if (!Func) @@ -1483,10 +1378,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SelectionDAG &DAG = CLI.DAG; SDLoc dl = CLI.DL; - SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; - SDValue Chain = CLI.Chain; + const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; SDValue Callee = CLI.Callee; - bool &isTailCall = CLI.IsTailCall; ArgListTy &Args = CLI.getArgs(); Type *RetTy = CLI.RetTy; const CallBase *CB = CLI.CB; @@ -1496,6 +1389,36 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, return DAG.getConstant(I, dl, MVT::i32); }; + const unsigned UniqueCallSite = GlobalUniqueCallSite++; + const SDValue CallChain = CLI.Chain; + const SDValue StartChain = + DAG.getCALLSEQ_START(CallChain, UniqueCallSite, 0, dl); + SDValue DeclareGlue = StartChain.getValue(1); + + SmallVector<SDValue, 16> CallPrereqs{StartChain}; + + const auto MakeDeclareScalarParam = [&](SDValue Symbol, unsigned Size) { + // PTX ABI requires integral types to be at least 32 bits in size. FP16 is + // loaded/stored using i16, so it's handled here as well. + const unsigned SizeBits = promoteScalarArgumentSize(Size * 8); + SDValue Declare = + DAG.getNode(NVPTXISD::DeclareScalarParam, dl, {MVT::Other, MVT::Glue}, + {StartChain, Symbol, GetI32(SizeBits), DeclareGlue}); + CallPrereqs.push_back(Declare); + DeclareGlue = Declare.getValue(1); + return Declare; + }; + + const auto MakeDeclareArrayParam = [&](SDValue Symbol, Align Align, + unsigned Size) { + SDValue Declare = DAG.getNode( + NVPTXISD::DeclareArrayParam, dl, {MVT::Other, MVT::Glue}, + {StartChain, Symbol, GetI32(Align.value()), GetI32(Size), DeclareGlue}); + CallPrereqs.push_back(Declare); + DeclareGlue = Declare.getValue(1); + return Declare; + }; + // Variadic arguments. // // Normally, for each argument, we declare a param scalar or a param @@ -1511,15 +1434,17 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // // After all vararg is processed, 'VAOffset' holds the size of the // vararg byte array. + assert((CLI.IsVarArg || CLI.Args.size() == CLI.NumFixedArgs) && + "Non-VarArg function with extra arguments"); - SDValue VADeclareParam; // vararg byte array const unsigned FirstVAArg = CLI.NumFixedArgs; // position of first variadic - unsigned VAOffset = 0; // current offset in the param array + unsigned VAOffset = 0; // current offset in the param array - const unsigned UniqueCallSite = GlobalUniqueCallSite++; - SDValue TempChain = Chain; - Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl); - SDValue InGlue = Chain.getValue(1); + const SDValue VADeclareParam = + CLI.Args.size() > FirstVAArg + ? MakeDeclareArrayParam(getCallParamSymbol(DAG, FirstVAArg, MVT::i32), + Align(STI.getMaxRequiredAlignment()), 0) + : SDValue(); // Args.size() and Outs.size() need not match. // Outs.size() will be larger @@ -1580,43 +1505,19 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, assert((!IsByVal || TypeSize == ArgOuts[0].Flags.getByValSize()) && "type size mismatch"); - const std::optional<SDValue> ArgDeclare = [&]() -> std::optional<SDValue> { - if (IsVAArg) { - if (ArgI == FirstVAArg) { - VADeclareParam = DAG.getNode( - NVPTXISD::DeclareArrayParam, dl, {MVT::Other, MVT::Glue}, - {Chain, ParamSymbol, GetI32(STI.getMaxRequiredAlignment()), - GetI32(0), InGlue}); - return VADeclareParam; - } - return std::nullopt; - } - if (IsByVal || shouldPassAsArray(Arg.Ty)) { - // declare .param .align <align> .b8 .param<n>[<size>]; - return DAG.getNode(NVPTXISD::DeclareArrayParam, dl, - {MVT::Other, MVT::Glue}, - {Chain, ParamSymbol, GetI32(ArgAlign.value()), - GetI32(TypeSize), InGlue}); - } + const SDValue ArgDeclare = [&]() { + if (IsVAArg) + return VADeclareParam; + + if (IsByVal || shouldPassAsArray(Arg.Ty)) + return MakeDeclareArrayParam(ParamSymbol, ArgAlign, TypeSize); + assert(ArgOuts.size() == 1 && "We must pass only one value as non-array"); - // declare .param .b<size> .param<n>; - - // PTX ABI requires integral types to be at least 32 bits in - // size. FP16 is loaded/stored using i16, so it's handled - // here as well. - const unsigned PromotedSize = - (ArgOuts[0].VT.isInteger() || ArgOuts[0].VT.isFloatingPoint()) - ? promoteScalarArgumentSize(TypeSize * 8) - : TypeSize * 8; - - return DAG.getNode(NVPTXISD::DeclareScalarParam, dl, - {MVT::Other, MVT::Glue}, - {Chain, ParamSymbol, GetI32(PromotedSize), InGlue}); + assert((ArgOuts[0].VT.isInteger() || ArgOuts[0].VT.isFloatingPoint()) && + "Only int and float types are supported as non-array arguments"); + + return MakeDeclareScalarParam(ParamSymbol, TypeSize); }(); - if (ArgDeclare) { - Chain = ArgDeclare->getValue(0); - InGlue = ArgDeclare->getValue(1); - } // PTX Interoperability Guide 3.3(A): [Integer] Values shorter // than 32-bits are sign extended or zero extended, depending on @@ -1626,36 +1527,25 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Arg.Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Arg.Ty) < 32; const auto GetStoredValue = [&](const unsigned I, EVT EltVT, - const Align PartAlign) { - SDValue StVal; + const MaybeAlign PartAlign) { if (IsByVal) { SDValue Ptr = ArgOutVals[0]; auto MPI = refinePtrAS(Ptr, DAG, DL, *this); SDValue SrcAddr = DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(Offsets[I])); - StVal = DAG.getLoad(EltVT, dl, TempChain, SrcAddr, MPI, PartAlign); - } else { - StVal = ArgOutVals[I]; - - auto PromotedVT = promoteScalarIntegerPTX(StVal.getValueType()); - if (PromotedVT != StVal.getValueType()) { - StVal = DAG.getNode(getExtOpcode(ArgOuts[I].Flags), dl, PromotedVT, - StVal); - } + return DAG.getLoad(EltVT, dl, CallChain, SrcAddr, MPI, PartAlign); } + SDValue StVal = ArgOutVals[I]; + assert(promoteScalarIntegerPTX(StVal.getValueType()) == + StVal.getValueType() && + "OutVal type should always be legal"); - if (ExtendIntegerParam) { - assert(VTs.size() == 1 && "Scalar can't have multiple parts."); - // zext/sext to i32 - StVal = - DAG.getNode(getExtOpcode(ArgOuts[I].Flags), dl, MVT::i32, StVal); - } else if (EltVT.getSizeInBits() < 16) { - // Use 16-bit registers for small stores as it's the - // smallest general purpose register size supported by NVPTX. - StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal); - } - return StVal; + const EVT VTI = promoteScalarIntegerPTX(VTs[I]); + const EVT StoreVT = + ExtendIntegerParam ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI); + + return correctParamType(StVal, StoreVT, ArgOuts[I].Flags, DAG, dl); }; const auto VectorInfo = @@ -1664,23 +1554,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, unsigned J = 0; for (const unsigned NumElts : VectorInfo) { const int CurOffset = Offsets[J]; - EVT EltVT = promoteScalarIntegerPTX(VTs[J]); - const Align PartAlign = commonAlignment(ArgAlign, CurOffset); - - // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a - // scalar store. In such cases, fall back to byte stores. - if (NumElts == 1 && !IsVAArg && PartAlign < DAG.getEVTAlign(EltVT)) { - - SDValue StVal = GetStoredValue(J, EltVT, PartAlign); - Chain = LowerUnalignedStoreParam(DAG, Chain, - CurOffset + (IsByVal ? VAOffset : 0), - EltVT, StVal, InGlue, ArgI, dl); - - // LowerUnalignedStoreParam took care of inserting the necessary nodes - // into the SDAG, so just move on to the next element. - J++; - continue; - } + const EVT EltVT = promoteScalarIntegerPTX(VTs[J]); if (IsVAArg && !IsByVal) // Align each part of the variadic argument to their type. @@ -1688,44 +1562,45 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, assert((IsVAArg || VAOffset == 0) && "VAOffset must be 0 for non-VA args"); - SmallVector<SDValue, 6> StoreOperands{ - Chain, GetI32(IsVAArg ? FirstVAArg : ArgI), - GetI32(VAOffset + ((IsVAArg && !IsByVal) ? 0 : CurOffset))}; - // Record the values to store. - for (const unsigned K : llvm::seq(NumElts)) - StoreOperands.push_back(GetStoredValue(J + K, EltVT, PartAlign)); - StoreOperands.push_back(InGlue); + const unsigned Offset = + (VAOffset + ((IsVAArg && !IsByVal) ? 0 : CurOffset)); + SDValue Ptr = + DAG.getObjectPtrOffset(dl, ParamSymbol, TypeSize::getFixed(Offset)); - NVPTXISD::NodeType Op; - switch (NumElts) { - case 1: - Op = NVPTXISD::StoreParam; - break; - case 2: - Op = NVPTXISD::StoreParamV2; - break; - case 4: - Op = NVPTXISD::StoreParamV4; - break; - default: - llvm_unreachable("Invalid vector info."); + const MaybeAlign CurrentAlign = ExtendIntegerParam + ? MaybeAlign(std::nullopt) + : commonAlignment(ArgAlign, Offset); + + SDValue Val; + if (NumElts == 1) { + Val = GetStoredValue(J, EltVT, CurrentAlign); + } else { + SmallVector<SDValue, 8> StoreVals; + for (const unsigned K : llvm::seq(NumElts)) { + SDValue ValJ = GetStoredValue(J + K, EltVT, CurrentAlign); + if (ValJ.getValueType().isVector()) + DAG.ExtractVectorElements(ValJ, StoreVals); + else + StoreVals.push_back(ValJ); + } + + EVT VT = EVT::getVectorVT( + *DAG.getContext(), StoreVals[0].getValueType(), StoreVals.size()); + Val = DAG.getBuildVector(VT, dl, StoreVals); } - // Adjust type of the store op if we've extended the scalar - // return value. - EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT; - Chain = DAG.getMemIntrinsicNode( - Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands, - TheStoreType, MachinePointerInfo(), PartAlign, - MachineMemOperand::MOStore); - InGlue = Chain.getValue(1); + SDValue StoreParam = + DAG.getStore(ArgDeclare, dl, Val, Ptr, + MachinePointerInfo(ADDRESS_SPACE_PARAM), CurrentAlign); + CallPrereqs.push_back(StoreParam); // TODO: We may need to support vector types that can be passed // as scalars in variadic arguments. if (IsVAArg && !IsByVal) { assert(NumElts == 1 && "Vectorization is expected to be disabled for variadics."); + const EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT; VAOffset += DL.getTypeAllocSize(TheStoreType.getTypeForEVT(*DAG.getContext())); } @@ -1736,33 +1611,21 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, VAOffset += TypeSize; } - GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode()); - // Handle Result if (!Ins.empty()) { - const SDValue RetDeclare = [&]() { - const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32); - const unsigned ResultSize = DL.getTypeAllocSizeInBits(RetTy); - if (shouldPassAsArray(RetTy)) { - const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL); - return DAG.getNode(NVPTXISD::DeclareArrayParam, dl, - {MVT::Other, MVT::Glue}, - {Chain, RetSymbol, GetI32(RetAlign.value()), - GetI32(ResultSize / 8), InGlue}); - } - const auto PromotedResultSize = promoteScalarArgumentSize(ResultSize); - return DAG.getNode( - NVPTXISD::DeclareScalarParam, dl, {MVT::Other, MVT::Glue}, - {Chain, RetSymbol, GetI32(PromotedResultSize), InGlue}); - }(); - Chain = RetDeclare.getValue(0); - InGlue = RetDeclare.getValue(1); + const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32); + const unsigned ResultSize = DL.getTypeAllocSize(RetTy); + if (shouldPassAsArray(RetTy)) { + const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL); + MakeDeclareArrayParam(RetSymbol, RetAlign, ResultSize); + } else { + MakeDeclareScalarParam(RetSymbol, ResultSize); + } } - const bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs); // Set the size of the vararg param byte array if the callee is a variadic // function and the variadic part is not empty. - if (HasVAArgs) { + if (VADeclareParam) { SDValue DeclareParamOps[] = {VADeclareParam.getOperand(0), VADeclareParam.getOperand(1), VADeclareParam.getOperand(2), GetI32(VAOffset), @@ -1771,6 +1634,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, VADeclareParam->getVTList(), DeclareParamOps); } + const auto *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode()); // If the type of the callsite does not match that of the function, convert // the callsite to an indirect call. const bool ConvertToIndirectCall = shouldConvertToIndirectCall(CB, Func); @@ -1800,15 +1664,16 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // instruction. // The prototype is embedded in a string and put as the operand for a // CallPrototype SDNode which will print out to the value of the string. + const bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs); std::string Proto = getPrototype(DL, RetTy, Args, CLI.Outs, HasVAArgs ? std::optional(FirstVAArg) : std::nullopt, *CB, UniqueCallSite); const char *ProtoStr = nvTM->getStrPool().save(Proto).data(); - Chain = DAG.getNode( - NVPTXISD::CallPrototype, dl, {MVT::Other, MVT::Glue}, - {Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InGlue}); - InGlue = Chain.getValue(1); + const SDValue PrototypeDeclare = DAG.getNode( + NVPTXISD::CallPrototype, dl, MVT::Other, + {StartChain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32)}); + CallPrereqs.push_back(PrototypeDeclare); } if (ConvertToIndirectCall) { @@ -1826,24 +1691,15 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, const unsigned NumArgs = std::min<unsigned>(CLI.NumFixedArgs + 1, Args.size()); /// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns, - /// NumParams, Callee, Proto, InGlue) - Chain = DAG.getNode(NVPTXISD::CALL, dl, {MVT::Other, MVT::Glue}, - {Chain, GetI32(CLI.IsConvergent), GetI32(IsIndirectCall), - GetI32(Ins.empty() ? 0 : 1), GetI32(NumArgs), Callee, - GetI32(Proto), InGlue}); - InGlue = Chain.getValue(1); - + /// NumParams, Callee, Proto) + const SDValue CallToken = DAG.getTokenFactor(dl, CallPrereqs); + const SDValue Call = DAG.getNode( + NVPTXISD::CALL, dl, MVT::Other, + {CallToken, GetI32(CLI.IsConvergent), GetI32(IsIndirectCall), + GetI32(Ins.empty() ? 0 : 1), GetI32(NumArgs), Callee, GetI32(Proto)}); + + SmallVector<SDValue, 16> LoadChains{Call}; SmallVector<SDValue, 16> ProxyRegOps; - // An item of the vector is filled if the element does not need a ProxyReg - // operation on it and should be added to InVals as is. ProxyRegOps and - // ProxyRegTruncates contain empty/none items at the same index. - SmallVector<SDValue, 16> RetElts; - // A temporary ProxyReg operations inserted in `LowerUnalignedLoadRetParam()` - // to use the values of `LoadParam`s and to be replaced later then - // `CALLSEQ_END` is added. - SmallVector<SDValue, 16> TempProxyRegOps; - - // Generate loads from param memory/moves from registers for result if (!Ins.empty()) { SmallVector<EVT, 16> VTs; SmallVector<uint64_t, 16> Offsets; @@ -1860,104 +1716,65 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, const auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign); unsigned I = 0; - for (const unsigned VectorizedSize : VectorInfo) { - EVT TheLoadType = promoteScalarIntegerPTX(VTs[I]); - EVT EltType = Ins[I].VT; - const Align EltAlign = commonAlignment(RetAlign, Offsets[I]); - - if (TheLoadType != VTs[I]) - EltType = TheLoadType; - - if (ExtendIntegerRetVal) { - TheLoadType = MVT::i32; - EltType = MVT::i32; - } else if (TheLoadType.getSizeInBits() < 16) { - EltType = MVT::i16; - } + for (const unsigned NumElts : VectorInfo) { + const MaybeAlign CurrentAlign = + ExtendIntegerRetVal ? MaybeAlign(std::nullopt) + : commonAlignment(RetAlign, Offsets[I]); - // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a - // scalar load. In such cases, fall back to byte loads. - if (VectorizedSize == 1 && RetTy->isAggregateType() && - EltAlign < DAG.getEVTAlign(TheLoadType)) { - SDValue Ret = LowerUnalignedLoadRetParam( - DAG, Chain, Offsets[I], TheLoadType, InGlue, TempProxyRegOps, dl); - ProxyRegOps.push_back(SDValue()); - RetElts.resize(I); - RetElts.push_back(Ret); - - I++; - continue; - } + const EVT VTI = promoteScalarIntegerPTX(VTs[I]); + const EVT LoadVT = + ExtendIntegerRetVal ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI); - SmallVector<EVT, 6> LoadVTs(VectorizedSize, EltType); - LoadVTs.append({MVT::Other, MVT::Glue}); + const unsigned PackingAmt = + LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1; - NVPTXISD::NodeType Op; - switch (VectorizedSize) { - case 1: - Op = NVPTXISD::LoadParam; - break; - case 2: - Op = NVPTXISD::LoadParamV2; - break; - case 4: - Op = NVPTXISD::LoadParamV4; - break; - default: - llvm_unreachable("Invalid vector info."); - } + const EVT VecVT = NumElts == 1 ? LoadVT + : EVT::getVectorVT(*DAG.getContext(), + LoadVT.getScalarType(), + NumElts * PackingAmt); - SDValue LoadOperands[] = {Chain, GetI32(1), GetI32(Offsets[I]), InGlue}; - SDValue RetVal = DAG.getMemIntrinsicNode( - Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType, - MachinePointerInfo(), EltAlign, MachineMemOperand::MOLoad); + const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32); + SDValue Ptr = + DAG.getObjectPtrOffset(dl, RetSymbol, TypeSize::getFixed(Offsets[I])); - for (const unsigned J : llvm::seq(VectorizedSize)) { - ProxyRegOps.push_back(RetVal.getValue(J)); - } + SDValue R = + DAG.getLoad(VecVT, dl, Call, Ptr, + MachinePointerInfo(ADDRESS_SPACE_PARAM), CurrentAlign); - Chain = RetVal.getValue(VectorizedSize); - InGlue = RetVal.getValue(VectorizedSize + 1); + LoadChains.push_back(R.getValue(1)); - I += VectorizedSize; + if (NumElts == 1) + ProxyRegOps.push_back(R); + else + for (const unsigned J : llvm::seq(NumElts)) { + SDValue Elt = DAG.getNode( + LoadVT.isVector() ? ISD::EXTRACT_SUBVECTOR + : ISD::EXTRACT_VECTOR_ELT, + dl, LoadVT, R, DAG.getVectorIdxConstant(J * PackingAmt, dl)); + ProxyRegOps.push_back(Elt); + } + I += NumElts; } } - Chain = - DAG.getCALLSEQ_END(Chain, UniqueCallSite, UniqueCallSite + 1, InGlue, dl); - InGlue = Chain.getValue(1); + const SDValue EndToken = DAG.getTokenFactor(dl, LoadChains); + const SDValue CallEnd = DAG.getCALLSEQ_END(EndToken, UniqueCallSite, + UniqueCallSite + 1, SDValue(), dl); // Append ProxyReg instructions to the chain to make sure that `callseq_end` // will not get lost. Otherwise, during libcalls expansion, the nodes can become // dangling. - for (const unsigned I : llvm::seq(ProxyRegOps.size())) { - if (I < RetElts.size() && RetElts[I]) { - InVals.push_back(RetElts[I]); - continue; - } - - SDValue Ret = - DAG.getNode(NVPTXISD::ProxyReg, dl, ProxyRegOps[I].getSimpleValueType(), - {Chain, ProxyRegOps[I]}); - - const EVT ExpectedVT = Ins[I].VT; - if (!Ret.getValueType().bitsEq(ExpectedVT)) { - Ret = DAG.getNode(ISD::TRUNCATE, dl, ExpectedVT, Ret); - } + for (const auto [I, Reg] : llvm::enumerate(ProxyRegOps)) { + SDValue Proxy = + DAG.getNode(NVPTXISD::ProxyReg, dl, Reg.getValueType(), {CallEnd, Reg}); + SDValue Ret = correctParamType(Proxy, Ins[I].VT, Ins[I].Flags, DAG, dl); InVals.push_back(Ret); } - for (SDValue &T : TempProxyRegOps) { - SDValue Repl = DAG.getNode(NVPTXISD::ProxyReg, dl, T.getSimpleValueType(), - {Chain, T.getOperand(0)}); - DAG.ReplaceAllUsesWith(T, Repl); - DAG.RemoveDeadNode(T.getNode()); - } - - // set isTailCall to false for now, until we figure out how to express + // set IsTailCall to false for now, until we figure out how to express // tail call optimization in PTX - isTailCall = false; - return Chain; + CLI.IsTailCall = false; + return CallEnd; } SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, @@ -5117,10 +4934,6 @@ combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { Operands.push_back(DCI.DAG.getIntPtrConstant( cast<LoadSDNode>(LD)->getExtensionType(), DL)); break; - case NVPTXISD::LoadParamV2: - OldNumOutputs = 2; - Opcode = NVPTXISD::LoadParamV4; - break; case NVPTXISD::LoadV2: OldNumOutputs = 2; Opcode = NVPTXISD::LoadV4; @@ -5201,12 +5014,6 @@ static SDValue combinePackingMovIntoStore(SDNode *N, MemVT = ST->getMemoryVT(); Opcode = NVPTXISD::StoreV2; break; - case NVPTXISD::StoreParam: - Opcode = NVPTXISD::StoreParamV2; - break; - case NVPTXISD::StoreParamV2: - Opcode = NVPTXISD::StoreParamV4; - break; case NVPTXISD::StoreV2: MemVT = ST->getMemoryVT(); Opcode = NVPTXISD::StoreV4; @@ -5218,7 +5025,6 @@ static SDValue combinePackingMovIntoStore(SDNode *N, return SDValue(); Opcode = NVPTXISD::StoreV8; break; - case NVPTXISD::StoreParamV4: case NVPTXISD::StoreV8: // PTX doesn't support the next doubling of operands return SDValue(); @@ -5263,30 +5069,11 @@ static SDValue combinePackingMovIntoStore(SDNode *N, MemVT, ST->getMemOperand()); } -static SDValue PerformStoreCombineHelper(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - unsigned Front, unsigned Back) { - if (all_of(N->ops().drop_front(Front).drop_back(Back), - [](const SDUse &U) { return U.get()->isUndef(); })) - // Operand 0 is the previous value in the chain. Cannot return EntryToken - // as the previous value will become unused and eliminated later. - return N->getOperand(0); - - return combinePackingMovIntoStore(N, DCI, Front, Back); -} - static SDValue PerformStoreCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { return combinePackingMovIntoStore(N, DCI, 1, 2); } -static SDValue PerformStoreParamCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - // Operands from the 3rd to the 2nd last one are the values to be stored. - // {Chain, ArgID, Offset, Val, Glue} - return PerformStoreCombineHelper(N, DCI, 3, 1); -} - /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. /// static SDValue PerformADDCombine(SDNode *N, @@ -5942,6 +5729,86 @@ static SDValue combinePRMT(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, N->getConstantOperandAPInt(2), N->getConstantOperandVal(3)), SDLoc(N), N->getValueType(0)); + return SDValue(); +} + +// During call lowering we wrap the return values in a ProxyReg node which +// depend on the chain value produced by the completed call. This ensures that +// the full call is emitted in cases where libcalls are used to legalize +// operations. To improve the functioning of other DAG combines we pull all +// operations we can through one of these nodes, ensuring that the ProxyReg +// directly wraps a load. That is: +// +// (ProxyReg (zext (load retval0))) => (zext (ProxyReg (load retval0))) +// +static SDValue sinkProxyReg(SDValue R, SDValue Chain, + TargetLowering::DAGCombinerInfo &DCI) { + switch (R.getOpcode()) { + case ISD::TRUNCATE: + case ISD::ANY_EXTEND: + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + case ISD::BITCAST: { + if (SDValue V = sinkProxyReg(R.getOperand(0), Chain, DCI)) + return DCI.DAG.getNode(R.getOpcode(), SDLoc(R), R.getValueType(), V); + return SDValue(); + } + case ISD::SHL: + case ISD::SRL: + case ISD::SRA: + case ISD::OR: { + if (SDValue A = sinkProxyReg(R.getOperand(0), Chain, DCI)) + if (SDValue B = sinkProxyReg(R.getOperand(1), Chain, DCI)) + return DCI.DAG.getNode(R.getOpcode(), SDLoc(R), R.getValueType(), A, B); + return SDValue(); + } + case ISD::Constant: + return R; + case ISD::LOAD: + case NVPTXISD::LoadV2: + case NVPTXISD::LoadV4: { + return DCI.DAG.getNode(NVPTXISD::ProxyReg, SDLoc(R), R.getValueType(), + {Chain, R}); + } + case ISD::BUILD_VECTOR: { + if (DCI.isBeforeLegalize()) + return SDValue(); + + SmallVector<SDValue, 16> Ops; + for (auto &Op : R->ops()) { + SDValue V = sinkProxyReg(Op, Chain, DCI); + if (!V) + return SDValue(); + Ops.push_back(V); + } + return DCI.DAG.getNode(ISD::BUILD_VECTOR, SDLoc(R), R.getValueType(), Ops); + } + case ISD::EXTRACT_VECTOR_ELT: { + if (DCI.isBeforeLegalize()) + return SDValue(); + + if (SDValue V = sinkProxyReg(R.getOperand(0), Chain, DCI)) + return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(R), + R.getValueType(), V, R.getOperand(1)); + return SDValue(); + } + default: + return SDValue(); + } +} + +static SDValue combineProxyReg(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + + SDValue Chain = N->getOperand(0); + SDValue Reg = N->getOperand(1); + + // If the ProxyReg is not wrapping a load, try to pull the operations through + // the ProxyReg. + if (Reg.getOpcode() != ISD::LOAD) { + if (SDValue V = sinkProxyReg(Reg, Chain, DCI)) + return V; + } return SDValue(); } @@ -5965,7 +5832,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, case ISD::FADD: return PerformFADDCombine(N, DCI, OptLevel); case ISD::LOAD: - case NVPTXISD::LoadParamV2: case NVPTXISD::LoadV2: case NVPTXISD::LoadV4: return combineUnpackingMovIntoLoad(N, DCI); @@ -5973,6 +5839,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, return PerformMULCombine(N, DCI, OptLevel); case NVPTXISD::PRMT: return combinePRMT(N, DCI, OptLevel); + case NVPTXISD::ProxyReg: + return combineProxyReg(N, DCI); case ISD::SETCC: return PerformSETCCCombine(N, DCI, STI.getSmVersion()); case ISD::SHL: @@ -5980,10 +5848,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, case ISD::SREM: case ISD::UREM: return PerformREMCombine(N, DCI, OptLevel); - case NVPTXISD::StoreParam: - case NVPTXISD::StoreParamV2: - case NVPTXISD::StoreParamV4: - return PerformStoreParamCombine(N, DCI); case ISD::STORE: case NVPTXISD::StoreV2: case NVPTXISD::StoreV4: @@ -6332,6 +6196,22 @@ static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG, Results.push_back(NewValue.getValue(3)); } +static void replaceProxyReg(SDNode *N, SelectionDAG &DAG, + const TargetLowering &TLI, + SmallVectorImpl<SDValue> &Results) { + SDValue Chain = N->getOperand(0); + SDValue Reg = N->getOperand(1); + + MVT VT = TLI.getRegisterType(*DAG.getContext(), Reg.getValueType()); + + SDValue NewReg = DAG.getAnyExtOrTrunc(Reg, SDLoc(N), VT); + SDValue NewProxy = + DAG.getNode(NVPTXISD::ProxyReg, SDLoc(N), VT, {Chain, NewReg}); + SDValue Res = DAG.getAnyExtOrTrunc(NewProxy, SDLoc(N), N->getValueType(0)); + + Results.push_back(Res); +} + void NVPTXTargetLowering::ReplaceNodeResults( SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { switch (N->getOpcode()) { @@ -6349,6 +6229,9 @@ void NVPTXTargetLowering::ReplaceNodeResults( case ISD::CopyFromReg: ReplaceCopyFromReg_128(N, DAG, Results); return; + case NVPTXISD::ProxyReg: + replaceProxyReg(N, DAG, *this, Results); + return; } } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index 228e2aa..cf72a1e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -38,7 +38,7 @@ enum NodeType : unsigned { /// This node represents a PTX call instruction. It's operands are as follows: /// /// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns, - /// NumParams, Callee, Proto, InGlue) + /// NumParams, Callee, Proto) CALL, MoveParam, @@ -84,13 +84,7 @@ enum NodeType : unsigned { StoreV2, StoreV4, StoreV8, - LoadParam, - LoadParamV2, - LoadParamV4, - StoreParam, - StoreParamV2, - StoreParamV4, - LAST_MEMORY_OPCODE = StoreParamV4, + LAST_MEMORY_OPCODE = StoreV8, }; } diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 442b900..86d6f7c 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -1757,12 +1757,6 @@ def SDTDeclareArrayParam : SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>]>; def SDTDeclareScalarParam : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; -def SDTLoadParamProfile : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>; -def SDTLoadParamV2Profile : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>; -def SDTLoadParamV4Profile : SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>; -def SDTStoreParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>; -def SDTStoreParamV2Profile : SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>; -def SDTStoreParamV4Profile : SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>; def SDTMoveParamProfile : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisSameAs<0, 1>]>; def SDTProxyReg : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>]>; @@ -1774,104 +1768,20 @@ def declare_array_param : def declare_scalar_param : SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParam, [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; - -def LoadParam : - SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile, - [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>; -def LoadParamV2 : - SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile, - [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>; -def LoadParamV4 : - SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile, - [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>; -def StoreParam : - SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def StoreParamV2 : - SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def StoreParamV4 : - SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; def MoveParam : SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>; def proxy_reg : SDNode<"NVPTXISD::ProxyReg", SDTProxyReg, [SDNPHasChain]>; /// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns, - /// NumParams, Callee, Proto, InGlue) + /// NumParams, Callee, Proto) def SDTCallProfile : SDTypeProfile<0, 6, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>, SDTCisVT<3, i32>, SDTCisVT<5, i32>]>; -def call : - SDNode<"NVPTXISD::CALL", SDTCallProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; - -let mayLoad = true in { - class LoadParamMemInst<NVPTXRegClass regclass, string opstr> : - NVPTXInst<(outs regclass:$dst), (ins Offseti32imm:$b), - !strconcat("ld.param", opstr, " \t$dst, [retval0$b];"), - []>; - - class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> : - NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins Offseti32imm:$b), - !strconcat("ld.param.v2", opstr, - " \t{{$dst, $dst2}}, [retval0$b];"), []>; - - class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> : - NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3, - regclass:$dst4), - (ins Offseti32imm:$b), - !strconcat("ld.param.v4", opstr, - " \t{{$dst, $dst2, $dst3, $dst4}}, [retval0$b];"), - []>; -} - -let mayStore = true in { - - multiclass StoreParamInst<NVPTXRegClass regclass, Operand IMMType, string opstr, bit support_imm = true> { - foreach op = [IMMType, regclass] in - if !or(support_imm, !isa<NVPTXRegClass>(op)) then - def _ # !if(!isa<NVPTXRegClass>(op), "r", "i") - : NVPTXInst<(outs), - (ins op:$val, i32imm:$a, Offseti32imm:$b), - "st.param" # opstr # " \t[param$a$b], $val;", - []>; - } - - multiclass StoreParamV2Inst<NVPTXRegClass regclass, Operand IMMType, string opstr> { - foreach op1 = [IMMType, regclass] in - foreach op2 = [IMMType, regclass] in - def _ # !if(!isa<NVPTXRegClass>(op1), "r", "i") - # !if(!isa<NVPTXRegClass>(op2), "r", "i") - : NVPTXInst<(outs), - (ins op1:$val1, op2:$val2, - i32imm:$a, Offseti32imm:$b), - "st.param.v2" # opstr # " \t[param$a$b], {{$val1, $val2}};", - []>; - } - - multiclass StoreParamV4Inst<NVPTXRegClass regclass, Operand IMMType, string opstr> { - foreach op1 = [IMMType, regclass] in - foreach op2 = [IMMType, regclass] in - foreach op3 = [IMMType, regclass] in - foreach op4 = [IMMType, regclass] in - def _ # !if(!isa<NVPTXRegClass>(op1), "r", "i") - # !if(!isa<NVPTXRegClass>(op2), "r", "i") - # !if(!isa<NVPTXRegClass>(op3), "r", "i") - # !if(!isa<NVPTXRegClass>(op4), "r", "i") - - : NVPTXInst<(outs), - (ins op1:$val1, op2:$val2, op3:$val3, op4:$val4, - i32imm:$a, Offseti32imm:$b), - "st.param.v4" # opstr # - " \t[param$a$b], {{$val1, $val2, $val3, $val4}};", - []>; - } -} +def call : SDNode<"NVPTXISD::CALL", SDTCallProfile, [SDNPHasChain, SDNPSideEffect]>; /// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns, -/// NumParams, Callee, Proto, InGlue) +/// NumParams, Callee, Proto) def CallOperand : Operand<i32> { let PrintMethod = "printCallOperand"; } @@ -1908,43 +1818,6 @@ foreach is_convergent = [0, 1] in { (call_uni_inst $addr, imm:$rets, imm:$params)>; } -def LoadParamMemI64 : LoadParamMemInst<B64, ".b64">; -def LoadParamMemI32 : LoadParamMemInst<B32, ".b32">; -def LoadParamMemI16 : LoadParamMemInst<B16, ".b16">; -def LoadParamMemI8 : LoadParamMemInst<B16, ".b8">; -def LoadParamMemV2I64 : LoadParamV2MemInst<B64, ".b64">; -def LoadParamMemV2I32 : LoadParamV2MemInst<B32, ".b32">; -def LoadParamMemV2I16 : LoadParamV2MemInst<B16, ".b16">; -def LoadParamMemV2I8 : LoadParamV2MemInst<B16, ".b8">; -def LoadParamMemV4I32 : LoadParamV4MemInst<B32, ".b32">; -def LoadParamMemV4I16 : LoadParamV4MemInst<B16, ".b16">; -def LoadParamMemV4I8 : LoadParamV4MemInst<B16, ".b8">; - -defm StoreParamI64 : StoreParamInst<B64, i64imm, ".b64">; -defm StoreParamI32 : StoreParamInst<B32, i32imm, ".b32">; -defm StoreParamI16 : StoreParamInst<B16, i16imm, ".b16">; -defm StoreParamI8 : StoreParamInst<B16, i8imm, ".b8">; - -defm StoreParamI8TruncI32 : StoreParamInst<B32, i8imm, ".b8", /* support_imm */ false>; -defm StoreParamI8TruncI64 : StoreParamInst<B64, i8imm, ".b8", /* support_imm */ false>; - -defm StoreParamV2I64 : StoreParamV2Inst<B64, i64imm, ".b64">; -defm StoreParamV2I32 : StoreParamV2Inst<B32, i32imm, ".b32">; -defm StoreParamV2I16 : StoreParamV2Inst<B16, i16imm, ".b16">; -defm StoreParamV2I8 : StoreParamV2Inst<B16, i8imm, ".b8">; - -defm StoreParamV4I32 : StoreParamV4Inst<B32, i32imm, ".b32">; -defm StoreParamV4I16 : StoreParamV4Inst<B16, i16imm, ".b16">; -defm StoreParamV4I8 : StoreParamV4Inst<B16, i8imm, ".b8">; - -defm StoreParamF32 : StoreParamInst<B32, f32imm, ".b32">; -defm StoreParamF64 : StoreParamInst<B64, f64imm, ".b64">; - -defm StoreParamV2F32 : StoreParamV2Inst<B32, f32imm, ".b32">; -defm StoreParamV2F64 : StoreParamV2Inst<B64, f64imm, ".b64">; - -defm StoreParamV4F32 : StoreParamV4Inst<B32, f32imm, ".b32">; - def DECLARE_PARAM_array : NVPTXInst<(outs), (ins i32imm:$a, i32imm:$align, i32imm:$size), ".param .align $align .b8 \t$a[$size];", []>; diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index 5e54b82..67cc01e 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -534,16 +534,26 @@ static DecodeStatus decodeRTZArg(MCInst &Inst, uint32_t Imm, int64_t Address, return MCDisassembler::Success; } -static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn, - uint64_t Address, - const MCDisassembler *Decoder); - static DecodeStatus decodeZcmpRlist(MCInst &Inst, uint32_t Imm, uint64_t Address, - const MCDisassembler *Decoder); + const MCDisassembler *Decoder) { + bool IsRVE = Decoder->getSubtargetInfo().hasFeature(RISCV::FeatureStdExtE); + if (Imm < RISCVZC::RA || (IsRVE && Imm >= RISCVZC::RA_S0_S2)) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::createImm(Imm)); + return MCDisassembler::Success; +} static DecodeStatus decodeXqccmpRlistS0(MCInst &Inst, uint32_t Imm, uint64_t Address, + const MCDisassembler *Decoder) { + if (Imm < RISCVZC::RA_S0) + return MCDisassembler::Fail; + return decodeZcmpRlist(Inst, Imm, Address, Decoder); +} + +static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn, + uint64_t Address, const MCDisassembler *Decoder); static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint32_t Insn, @@ -592,24 +602,6 @@ static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn, return S; } -static DecodeStatus decodeZcmpRlist(MCInst &Inst, uint32_t Imm, - uint64_t Address, - const MCDisassembler *Decoder) { - bool IsRVE = Decoder->getSubtargetInfo().hasFeature(RISCV::FeatureStdExtE); - if (Imm < RISCVZC::RA || (IsRVE && Imm >= RISCVZC::RA_S0_S2)) - return MCDisassembler::Fail; - Inst.addOperand(MCOperand::createImm(Imm)); - return MCDisassembler::Success; -} - -static DecodeStatus decodeXqccmpRlistS0(MCInst &Inst, uint32_t Imm, - uint64_t Address, - const MCDisassembler *Decoder) { - if (Imm < RISCVZC::RA_S0) - return MCDisassembler::Fail; - return decodeZcmpRlist(Inst, Imm, Address, Decoder); -} - // Add implied SP operand for C.*SP compressed instructions. The SP operand // isn't explicitly encoded in the instruction. void RISCVDisassembler::addSPOperands(MCInst &MI) const { diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index 2c37c3b..82e3b5c 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -320,6 +320,7 @@ bool RISCVAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) { MCFixup Fixup = MCFixup::create(0, Expr, FirstLiteralRelocationKind + ELF::R_RISCV_ALIGN); F.setVarFixups({Fixup}); + F.setLinkerRelaxable(); F.getParent()->setLinkerRelaxable(); return true; } diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index b1ab76a..9fc0d81 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -1581,7 +1581,8 @@ void RISCVFrameLowering::determineCalleeSaves(MachineFunction &MF, // Set the register and all its subregisters. if (!MRI.def_empty(CSReg) || MRI.getUsedPhysRegsMask().test(CSReg)) { SavedRegs.set(CSReg); - llvm::for_each(SubRegs, [&](unsigned Reg) { return SavedRegs.set(Reg); }); + for (unsigned Reg : SubRegs) + SavedRegs.set(Reg); } // Combine to super register if all of its subregisters are marked. diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 54845e5..607edd3 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -2739,6 +2739,27 @@ bool RISCVTargetLowering::isLegalElementTypeForRVV(EVT ScalarTy) const { } } +bool RISCVTargetLowering::isLegalLoadStoreElementTypeForRVV( + EVT ScalarTy) const { + if (!ScalarTy.isSimple()) + return false; + switch (ScalarTy.getSimpleVT().SimpleTy) { + case MVT::iPTR: + return Subtarget.is64Bit() ? Subtarget.hasVInstructionsI64() : true; + case MVT::i8: + case MVT::i16: + case MVT::i32: + case MVT::f16: + case MVT::bf16: + case MVT::f32: + return true; + case MVT::i64: + case MVT::f64: + return Subtarget.hasVInstructionsI64(); + default: + return false; + } +} unsigned RISCVTargetLowering::combineRepeatedFPDivisors() const { return NumRepeatedDivisors; @@ -24239,7 +24260,7 @@ bool RISCVTargetLowering::isLegalStridedLoadStore(EVT DataType, return false; EVT ScalarType = DataType.getScalarType(); - if (!isLegalElementTypeForRVV(ScalarType)) + if (!isLegalLoadStoreElementTypeForRVV(ScalarType)) return false; if (!Subtarget.enableUnalignedVectorMem() && diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index ca70c46..a788c0b7 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -384,6 +384,7 @@ public: bool shouldRemoveExtendFromGSIndex(SDValue Extend, EVT DataVT) const override; bool isLegalElementTypeForRVV(EVT ScalarTy) const; + bool isLegalLoadStoreElementTypeForRVV(EVT ScalarTy) const; bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override; diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp index 30d8f85..3cbe668 100644 --- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp +++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp @@ -32,7 +32,7 @@ bool RISCVTargetLowering::isLegalInterleavedAccessType( if (!isTypeLegal(VT)) return false; - if (!isLegalElementTypeForRVV(VT.getScalarType()) || + if (!isLegalLoadStoreElementTypeForRVV(VT.getScalarType()) || !allowsMemoryAccessForAlignment(VTy->getContext(), DL, VT, AddrSpace, Alignment)) return false; diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index d62d99c..f0510ec 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -265,7 +265,7 @@ public: if (!ST->enableUnalignedVectorMem() && Alignment < ElemType.getStoreSize()) return false; - return TLI->isLegalElementTypeForRVV(ElemType); + return TLI->isLegalLoadStoreElementTypeForRVV(ElemType); } bool isLegalMaskedLoad(Type *DataType, Align Alignment, @@ -297,7 +297,7 @@ public: if (!ST->enableUnalignedVectorMem() && Alignment < ElemType.getStoreSize()) return false; - return TLI->isLegalElementTypeForRVV(ElemType); + return TLI->isLegalLoadStoreElementTypeForRVV(ElemType); } bool isLegalMaskedGather(Type *DataType, Align Alignment) const override { diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp index 3ef6030..72bb372 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp @@ -69,8 +69,8 @@ void SystemZHLASMAsmStreamer::EmitEOL() { void SystemZHLASMAsmStreamer::changeSection(MCSection *Section, uint32_t Subsection) { - Section->printSwitchToSection(*MAI, getContext().getTargetTriple(), OS, - Subsection); + MAI->printSwitchToSection(*Section, Subsection, + getContext().getTargetTriple(), OS); MCStreamer::changeSection(Section, Subsection); } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index d13862f..143298b 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -1540,6 +1540,8 @@ multiclass SIMDMADD<Vec vec, bits<32> simdopA, bits<32> simdopS, list<Predicate> def : Pat<(fadd_contract (vec.vt V128:$a), (fmul_contract (vec.vt V128:$b), (vec.vt V128:$c))), (!cast<Instruction>("MADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>; + def : Pat<(fsub_contract (vec.vt V128:$a), (fmul_contract (vec.vt V128:$b), (vec.vt V128:$c))), + (!cast<Instruction>("NMADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>; } defm "" : SIMDMADD<F32x4, 0x105, 0x106, [HasRelaxedSIMD]>; |