Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--  llvm/lib/Target/AArch64/AArch64Combine.td                |   2
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp          |  81
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp          |  91
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp   |  34
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h     |   1
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp    |  22
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h      |   1
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp        |  28
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNRegPressure.cpp                |   8
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNRegPressure.h                  |   4
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp                |  32
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h           |   6
-rw-r--r--  llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp             |  12
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP3PInstructions.td              |   8
-rw-r--r--  llvm/lib/Target/ARM/ARMISelLowering.cpp                  |   1
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonPatterns.td               |  21
-rw-r--r--  llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp         |   2
-rw-r--r--  llvm/lib/Target/Mips/MipsInstrInfo.td                    |  16
-rw-r--r--  llvm/lib/Target/RISCV/RISCVFeatures.td                   |   7
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td              |  23
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrPredicates.td            |   4
-rw-r--r--  llvm/lib/Target/RISCV/RISCVProcessors.td                 |   3
-rw-r--r--  llvm/lib/Target/RISCV/RISCVSchedSiFive7.td               | 142
-rw-r--r--  llvm/lib/Target/RISCV/RISCVScheduleV.td                  |  16
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp       |   1
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp    |   1
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp |   1
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp              |   1
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp                  |   1
-rw-r--r--  llvm/lib/Target/X86/X86ISelLoweringCall.cpp              |   2
30 files changed, 380 insertions, 192 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 639ddcb..ecaeff7 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -350,7 +350,7 @@ def AArch64PostLegalizerLowering
// Post-legalization combines which are primarily optimizations.
def AArch64PostLegalizerCombiner
: GICombiner<"AArch64PostLegalizerCombinerImpl",
- [copy_prop, cast_of_cast_combines,
+ [copy_prop, cast_of_cast_combines, constant_fold_fp_ops,
buildvector_of_truncate, integer_of_truncate,
mutate_anyext_to_zext, combines_for_extload,
combine_indexed_load_store, sext_trunc_sextload,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 31b3d18..7294f3e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16249,7 +16249,9 @@ SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
bool Negated;
uint64_t SplatVal;
- if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
+ // NOTE: SRAD cannot be used to represent sdiv-by-one.
+ if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated) &&
+ SplatVal > 1) {
SDValue Pg = getPredicateForScalableVector(DAG, DL, VT);
SDValue Res =
DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, DL, VT, Pg, Op->getOperand(0),
@@ -30034,7 +30036,9 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
bool Negated;
uint64_t SplatVal;
- if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
+ // NOTE: SRAD cannot be used to represent sdiv-by-one.
+ if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated) &&
+ SplatVal > 1) {
EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), DL, MVT::i32);
@@ -30606,6 +30610,43 @@ AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
assert(OpVT.isScalableVector() &&
"Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
+ if (Op->getNumOperands() == 3) {
+ // aarch64_sve_ld3 only supports packed datatypes.
+ EVT PackedVT = getPackedSVEVectorVT(OpVT.getVectorElementCount());
+ Align Alignment = DAG.getReducedAlign(PackedVT, /*UseABI=*/false);
+ SDValue StackPtr =
+ DAG.CreateStackTemporary(PackedVT.getStoreSize() * 3, Alignment);
+
+ // Write out unmodified operands.
+ SmallVector<SDValue, 3> Chains;
+ for (unsigned I = 0; I < 3; ++I) {
+ SDValue Ptr =
+ DAG.getMemBasePlusOffset(StackPtr, PackedVT.getStoreSize() * I, DL);
+ SDValue V = getSVESafeBitCast(PackedVT, Op.getOperand(I), DAG);
+ Chains.push_back(
+ DAG.getStore(DAG.getEntryNode(), DL, V, Ptr, MachinePointerInfo()));
+ }
+
+ Intrinsic::ID IntID = Intrinsic::aarch64_sve_ld3_sret;
+ EVT PredVT = PackedVT.changeVectorElementType(MVT::i1);
+
+ SmallVector<SDValue, 7> Ops;
+ Ops.push_back(DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains));
+ Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64));
+ Ops.push_back(DAG.getConstant(1, DL, PredVT));
+ Ops.push_back(StackPtr);
+
+ // Read back and deinterleave data.
+ SDVTList VTs = DAG.getVTList(PackedVT, PackedVT, PackedVT, MVT::Other);
+ SDValue LD3 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops);
+
+ SmallVector<SDValue, 3> Results;
+ Results.push_back(getSVESafeBitCast(OpVT, LD3.getValue(0), DAG));
+ Results.push_back(getSVESafeBitCast(OpVT, LD3.getValue(1), DAG));
+ Results.push_back(getSVESafeBitCast(OpVT, LD3.getValue(2), DAG));
+ return DAG.getMergeValues(Results, DL);
+ }
+
// Are multi-register uzp instructions available?
if (Subtarget->hasSME2() && Subtarget->isStreaming() &&
OpVT.getVectorElementType() != MVT::i1) {
@@ -30647,6 +30688,42 @@ SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
assert(OpVT.isScalableVector() &&
"Expected scalable vector in LowerVECTOR_INTERLEAVE.");
+ if (Op->getNumOperands() == 3) {
+ // aarch64_sve_st3 only supports packed datatypes.
+ EVT PackedVT = getPackedSVEVectorVT(OpVT.getVectorElementCount());
+ SmallVector<SDValue, 3> InVecs;
+ for (SDValue V : Op->ops())
+ InVecs.push_back(getSVESafeBitCast(PackedVT, V, DAG));
+
+ Align Alignment = DAG.getReducedAlign(PackedVT, /*UseABI=*/false);
+ SDValue StackPtr =
+ DAG.CreateStackTemporary(PackedVT.getStoreSize() * 3, Alignment);
+
+ Intrinsic::ID IntID = Intrinsic::aarch64_sve_st3;
+ EVT PredVT = PackedVT.changeVectorElementType(MVT::i1);
+
+ SmallVector<SDValue, 7> Ops;
+ Ops.push_back(DAG.getEntryNode());
+ Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64));
+ Ops.append(InVecs);
+ Ops.push_back(DAG.getConstant(1, DL, PredVT));
+ Ops.push_back(StackPtr);
+
+ // Interleave operands and store.
+ SDValue Chain = DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops);
+
+ // Read back the interleaved data.
+ SmallVector<SDValue, 3> Results;
+ for (unsigned I = 0; I < 3; ++I) {
+ SDValue Ptr =
+ DAG.getMemBasePlusOffset(StackPtr, PackedVT.getStoreSize() * I, DL);
+ SDValue L = DAG.getLoad(PackedVT, DL, Chain, Ptr, MachinePointerInfo());
+ Results.push_back(getSVESafeBitCast(OpVT, L, DAG));
+ }
+
+ return DAG.getMergeValues(Results, DL);
+ }
+
// Are multi-register zip instructions available?
if (Subtarget->hasSME2() && Subtarget->isStreaming() &&
OpVT.getVectorElementType() != MVT::i1) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 24bef82..8e35ba7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -15,6 +15,7 @@
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "SIModeRegisterDefaults.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
@@ -27,6 +28,7 @@
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ValueHandle.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/KnownBits.h"
@@ -106,6 +108,7 @@ public:
bool FlowChanged = false;
mutable Function *SqrtF32 = nullptr;
mutable Function *LdexpF32 = nullptr;
+ mutable SmallVector<WeakVH> DeadVals;
DenseMap<const PHINode *, bool> BreakPhiNodesCache;
@@ -242,6 +245,8 @@ public:
Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src,
FastMathFlags FMF) const;
+ bool tryNarrowMathIfNoOverflow(Instruction *I);
+
public:
bool visitFDiv(BinaryOperator &I);
@@ -281,28 +286,21 @@ bool AMDGPUCodeGenPrepareImpl::run() {
BreakPhiNodesCache.clear();
bool MadeChange = false;
- Function::iterator NextBB;
- for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) {
- BasicBlock *BB = &*FI;
- NextBB = std::next(FI);
-
- BasicBlock::iterator Next;
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
- I = Next) {
- Next = std::next(I);
-
- MadeChange |= visit(*I);
-
- if (Next != E) { // Control flow changed
- BasicBlock *NextInstBB = Next->getParent();
- if (NextInstBB != BB) {
- BB = NextInstBB;
- E = BB->end();
- FE = F.end();
- }
- }
+ // Need to use make_early_inc_range because integer division expansion is
+ // handled by Transform/Utils, and it can delete instructions such as the
+ // terminator of the BB.
+ for (BasicBlock &BB : reverse(F)) {
+ for (Instruction &I : make_early_inc_range(reverse(BB))) {
+ if (!isInstructionTriviallyDead(&I, TLI))
+ MadeChange |= visit(I);
}
}
+
+ while (!DeadVals.empty()) {
+ if (auto *I = dyn_cast_or_null<Instruction>(DeadVals.pop_back_val()))
+ RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
+ }
+
return MadeChange;
}
@@ -422,7 +420,7 @@ bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
Value *NewVal = insertValues(Builder, Ty, ResultVals);
NewVal->takeName(&I);
I.replaceAllUsesWith(NewVal);
- I.eraseFromParent();
+ DeadVals.push_back(&I);
return true;
}
@@ -496,10 +494,10 @@ bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {
FoldedT, FoldedF);
NewSelect->takeName(&BO);
BO.replaceAllUsesWith(NewSelect);
- BO.eraseFromParent();
+ DeadVals.push_back(&BO);
if (CastOp)
- CastOp->eraseFromParent();
- Sel->eraseFromParent();
+ DeadVals.push_back(CastOp);
+ DeadVals.push_back(Sel);
return true;
}
@@ -895,7 +893,7 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
if (NewVal) {
FDiv.replaceAllUsesWith(NewVal);
NewVal->takeName(&FDiv);
- RecursivelyDeleteTriviallyDeadInstructions(&FDiv, TLI);
+ DeadVals.push_back(&FDiv);
}
return true;
@@ -1302,10 +1300,7 @@ it will create `s_and_b32 s0, s0, 0xff`.
We accept this change since the non-byte load assumes the upper bits
within the byte are all 0.
*/
-static bool tryNarrowMathIfNoOverflow(Instruction *I,
- const SITargetLowering *TLI,
- const TargetTransformInfo &TTI,
- const DataLayout &DL) {
+bool AMDGPUCodeGenPrepareImpl::tryNarrowMathIfNoOverflow(Instruction *I) {
unsigned Opc = I->getOpcode();
Type *OldType = I->getType();
@@ -1330,6 +1325,7 @@ static bool tryNarrowMathIfNoOverflow(Instruction *I,
NewType = I->getType()->getWithNewBitWidth(NewBit);
// Old cost
+ const TargetTransformInfo &TTI = TM.getTargetTransformInfo(F);
InstructionCost OldCost =
TTI.getArithmeticInstrCost(Opc, OldType, TTI::TCK_RecipThroughput);
// New cost of new op
@@ -1360,7 +1356,7 @@ static bool tryNarrowMathIfNoOverflow(Instruction *I,
Value *Zext = Builder.CreateZExt(Arith, OldType);
I->replaceAllUsesWith(Zext);
- I->eraseFromParent();
+ DeadVals.push_back(I);
return true;
}
@@ -1370,8 +1366,7 @@ bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
if (UseMul24Intrin && replaceMulWithMul24(I))
return true;
- if (tryNarrowMathIfNoOverflow(&I, ST.getTargetLowering(),
- TM.getTargetTransformInfo(F), DL))
+ if (tryNarrowMathIfNoOverflow(&I))
return true;
bool Changed = false;
@@ -1436,7 +1431,7 @@ bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
if (NewDiv) {
I.replaceAllUsesWith(NewDiv);
- I.eraseFromParent();
+ DeadVals.push_back(&I);
Changed = true;
}
}
@@ -1492,7 +1487,7 @@ bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
I.replaceAllUsesWith(ValOrig);
- I.eraseFromParent();
+ DeadVals.push_back(&I);
return true;
}
@@ -1534,7 +1529,7 @@ bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {
Fract->takeName(&I);
I.replaceAllUsesWith(Fract);
- RecursivelyDeleteTriviallyDeadInstructions(&I, TLI);
+ DeadVals.push_back(&I);
return true;
}
@@ -1822,7 +1817,7 @@ bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
}
I.replaceAllUsesWith(Vec);
- I.eraseFromParent();
+ DeadVals.push_back(&I);
return true;
}
@@ -1903,7 +1898,7 @@ bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
auto *Intrin = B.CreateIntrinsic(
I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)});
I.replaceAllUsesWith(Intrin);
- I.eraseFromParent();
+ DeadVals.push_back(&I);
return true;
}
@@ -2000,16 +1995,10 @@ bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) {
Value *Fract = applyFractPat(Builder, FractArg);
Fract->takeName(&I);
I.replaceAllUsesWith(Fract);
-
- RecursivelyDeleteTriviallyDeadInstructions(&I, TLI);
+ DeadVals.push_back(&I);
return true;
}
-static bool isOneOrNegOne(const Value *Val) {
- const APFloat *C;
- return match(Val, m_APFloat(C)) && C->getExactLog2Abs() == 0;
-}
-
// Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way.
bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
Type *Ty = Sqrt.getType()->getScalarType();
@@ -2030,18 +2019,6 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
if (ReqdAccuracy < 1.0f)
return false;
- // FIXME: This is an ugly hack for this pass using forward iteration instead
- // of reverse. If it worked like a normal combiner, the rsq would form before
- // we saw a sqrt call.
- auto *FDiv =
- dyn_cast_or_null<FPMathOperator>(Sqrt.getUniqueUndroppableUser());
- if (FDiv && FDiv->getOpcode() == Instruction::FDiv &&
- FDiv->getFPAccuracy() >= 1.0f &&
- canOptimizeWithRsq(FPOp, FDiv->getFastMathFlags(), SqrtFMF) &&
- // TODO: We should also handle the arcp case for the fdiv with non-1 value
- isOneOrNegOne(FDiv->getOperand(0)))
- return false;
-
Value *SrcVal = Sqrt.getOperand(0);
bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt);
@@ -2065,7 +2042,7 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
Value *NewSqrt = insertValues(Builder, Sqrt.getType(), ResultVals);
NewSqrt->takeName(&Sqrt);
Sqrt.replaceAllUsesWith(NewSqrt);
- Sqrt.eraseFromParent();
+ DeadVals.push_back(&Sqrt);
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index 73b2660..5407566 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -468,6 +468,38 @@ void RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) {
MI.eraseFromParent();
}
+void RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) {
+ Register Lo, Hi;
+ switch (MI.getOpcode()) {
+ case AMDGPU::G_SMIN:
+ case AMDGPU::G_SMAX: {
+ // For signed operations, use sign extension
+ auto [Val0_Lo, Val0_Hi] = unpackSExt(MI.getOperand(1).getReg());
+ auto [Val1_Lo, Val1_Hi] = unpackSExt(MI.getOperand(2).getReg());
+ Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
+ .getReg(0);
+ Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
+ .getReg(0);
+ break;
+ }
+ case AMDGPU::G_UMIN:
+ case AMDGPU::G_UMAX: {
+ // For unsigned operations, use zero extension
+ auto [Val0_Lo, Val0_Hi] = unpackZExt(MI.getOperand(1).getReg());
+ auto [Val1_Lo, Val1_Hi] = unpackZExt(MI.getOperand(2).getReg());
+ Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
+ .getReg(0);
+ Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
+ .getReg(0);
+ break;
+ }
+ default:
+ llvm_unreachable("Unpack min/max lowering not implemented");
+ }
+ B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
+ MI.eraseFromParent();
+}
+
static bool isSignedBFE(MachineInstr &MI) {
if (GIntrinsic *GI = dyn_cast<GIntrinsic>(&MI))
return (GI->is(Intrinsic::amdgcn_sbfe));
@@ -654,6 +686,8 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
}
case UnpackBitShift:
return lowerUnpackBitShift(MI);
+ case UnpackMinMax:
+ return lowerUnpackMinMax(MI);
case Ext32To64: {
const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
MachineInstrBuilder Hi;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
index 7affe5a..d937815 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
@@ -123,6 +123,7 @@ private:
void lowerSplitTo32(MachineInstr &MI);
void lowerSplitTo32Select(MachineInstr &MI);
void lowerSplitTo32SExtInReg(MachineInstr &MI);
+ void lowerUnpackMinMax(MachineInstr &MI);
};
} // end namespace AMDGPU
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index f413bbc..bfe2c80 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -522,6 +522,22 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32, Sgpr32}, S_BFE})
.Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32, Vgpr32}, V_BFE});
+ addRulesForGOpcs({G_SMIN, G_SMAX}, Standard)
+ .Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt, Sgpr32SExt}})
+ .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
+ .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
+ .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
+ .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, UnpackMinMax})
+ .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}});
+
+ addRulesForGOpcs({G_UMIN, G_UMAX}, Standard)
+ .Uni(S16, {{Sgpr32Trunc}, {Sgpr32ZExt, Sgpr32ZExt}})
+ .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
+ .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
+ .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
+ .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, UnpackMinMax})
+ .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}});
+
// Note: we only write S1 rules for G_IMPLICIT_DEF, G_CONSTANT, G_FCONSTANT
// and G_FREEZE here, rest is trivially regbankselected earlier
addRulesForGOpcs({G_IMPLICIT_DEF}).Any({{UniS1}, {{Sgpr32Trunc}, {}}});
@@ -617,6 +633,12 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Any({{UniS64, S64}, {{Sgpr64}, {Sgpr64}}})
.Any({{DivS64, S64}, {{Vgpr64}, {Vgpr64}, SplitTo32SExtInReg}});
+ addRulesForGOpcs({G_ASSERT_ZEXT, G_ASSERT_SEXT}, Standard)
+ .Uni(S32, {{Sgpr32}, {Sgpr32, Imm}})
+ .Div(S32, {{Vgpr32}, {Vgpr32, Imm}})
+ .Uni(S64, {{Sgpr64}, {Sgpr64, Imm}})
+ .Div(S64, {{Vgpr64}, {Vgpr64, Imm}});
+
bool hasSMRDx3 = ST->hasScalarDwordx3Loads();
bool hasSMRDSmall = ST->hasScalarSubwordLoads();
bool usesTrue16 = ST->useRealTrue16Insts();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
index d0c6910..93e0efd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
@@ -212,6 +212,7 @@ enum LoweringMethodID {
VccExtToSel,
UniExtToSel,
UnpackBitShift,
+ UnpackMinMax,
S_BFE,
V_BFE,
VgprToVccCopy,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 557d87f..56807a4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5053,16 +5053,18 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
//
// vdst, srcA, srcB, srcC
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
+ bool UseAGPRForm = !Subtarget.hasGFX90AInsts() ||
+ Info->selectAGPRFormMFMA(MinNumRegsRequired);
+
OpdsMapping[0] =
- Info->getMinNumAGPRs() >= MinNumRegsRequired
- ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
- : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+ UseAGPRForm ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
+ : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
OpdsMapping[4] =
- Info->getMinNumAGPRs() >= MinNumRegsRequired
- ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
- : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+ UseAGPRForm ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
+ : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
break;
}
case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
@@ -5115,11 +5117,21 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8: {
+ Register DstReg = MI.getOperand(0).getReg();
+ unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
+ unsigned MinNumRegsRequired = DstSize / 32;
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ bool UseAGPRForm = Info->selectAGPRFormMFMA(MinNumRegsRequired);
+
// vdst, srcA, srcB, srcC, idx
- OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+ OpdsMapping[0] = UseAGPRForm ? getAGPROpMapping(DstReg, MRI, *TRI)
+ : getVGPROpMapping(DstReg, MRI, *TRI);
+
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
- OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+ OpdsMapping[4] =
+ UseAGPRForm ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
+ : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
break;
}
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index ef63acc..71494be 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -905,7 +905,7 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) {
OS << ":\n";
SlotIndex MBBStartSlot = LIS.getSlotIndexes()->getMBBStartIdx(&MBB);
- SlotIndex MBBEndSlot = LIS.getSlotIndexes()->getMBBEndIdx(&MBB);
+ SlotIndex MBBLastSlot = LIS.getSlotIndexes()->getMBBLastIdx(&MBB);
GCNRPTracker::LiveRegSet LiveIn, LiveOut;
GCNRegPressure RPAtMBBEnd;
@@ -931,7 +931,7 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) {
}
} else {
GCNUpwardRPTracker RPT(LIS);
- RPT.reset(MRI, MBBEndSlot);
+ RPT.reset(MRI, MBBLastSlot);
LiveOut = RPT.getLiveRegs();
RPAtMBBEnd = RPT.getPressure();
@@ -966,14 +966,14 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) {
OS << PFX " Live-out:" << llvm::print(LiveOut, MRI);
if (UseDownwardTracker)
- ReportLISMismatchIfAny(LiveOut, getLiveRegs(MBBEndSlot, LIS, MRI));
+ ReportLISMismatchIfAny(LiveOut, getLiveRegs(MBBLastSlot, LIS, MRI));
GCNRPTracker::LiveRegSet LiveThrough;
for (auto [Reg, Mask] : LiveIn) {
LaneBitmask MaskIntersection = Mask & LiveOut.lookup(Reg);
if (MaskIntersection.any()) {
LaneBitmask LTMask = getRegLiveThroughMask(
- MRI, LIS, Reg, MBBStartSlot, MBBEndSlot, MaskIntersection);
+ MRI, LIS, Reg, MBBStartSlot, MBBLastSlot, MaskIntersection);
if (LTMask.any())
LiveThrough[Reg] = LTMask;
}
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index a9c58bb..898d1ff 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -313,8 +313,8 @@ public:
/// reset tracker to the end of the \p MBB.
void reset(const MachineBasicBlock &MBB) {
- reset(MBB.getParent()->getRegInfo(),
- LIS.getSlotIndexes()->getMBBEndIdx(&MBB));
+ SlotIndex MBBLastSlot = LIS.getSlotIndexes()->getMBBLastIdx(&MBB);
+ reset(MBB.getParent()->getRegInfo(), MBBLastSlot);
}
/// reset tracker to the point just after \p MI (in program order).
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 730be69..80e985d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -103,52 +103,52 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::Untyped, V64RegClass);
addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
- addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
+ addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass);
addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
- addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
+ addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
- addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
+ addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass);
addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
- addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
+ addRegisterClass(MVT::v6f32, &AMDGPU::VReg_192RegClass);
addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
- addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
+ addRegisterClass(MVT::v3f64, &AMDGPU::VReg_192RegClass);
addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
- addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
+ addRegisterClass(MVT::v7f32, &AMDGPU::VReg_224RegClass);
addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
- addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
+ addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
- addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
+ addRegisterClass(MVT::v4f64, &AMDGPU::VReg_256RegClass);
addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
- addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
+ addRegisterClass(MVT::v9f32, &AMDGPU::VReg_288RegClass);
addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
- addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
+ addRegisterClass(MVT::v10f32, &AMDGPU::VReg_320RegClass);
addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
- addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
+ addRegisterClass(MVT::v11f32, &AMDGPU::VReg_352RegClass);
addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
- addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
+ addRegisterClass(MVT::v12f32, &AMDGPU::VReg_384RegClass);
addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
- addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
+ addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
- addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
+ addRegisterClass(MVT::v8f64, &AMDGPU::VReg_512RegClass);
addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
- addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
+ addRegisterClass(MVT::v16f64, &AMDGPU::VReg_1024RegClass);
if (Subtarget->has16BitInsts()) {
if (Subtarget->useRealTrue16Insts()) {
@@ -180,7 +180,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
}
addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
- addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
+ addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass);
computeRegisterProperties(Subtarget->getRegisterInfo());
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index b7dbb59..2c1a13c 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -1202,6 +1202,12 @@ public:
unsigned getMinNumAGPRs() const { return MinNumAGPRs; }
+ /// Return true if an MFMA that requires at least \p NumRegs should select to
+ /// the AGPR form, instead of the VGPR form.
+ bool selectAGPRFormMFMA(unsigned NumRegs) const {
+ return !MFMAVGPRForm && getMinNumAGPRs() >= NumRegs;
+ }
+
// \returns true if a function has a use of AGPRs via inline asm or
// has a call which may use it.
bool mayUseAGPRs(const Function &F) const;
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index 4d3331a..c684f9e 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -674,15 +674,9 @@ void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) {
createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/true);
MachineOperand HiDstOp = Op0HOp1H->getOperand(0);
- if (I.getFlag(MachineInstr::MIFlag::NoFPExcept)) {
- Op0LOp1L->setFlag(MachineInstr::MIFlag::NoFPExcept);
- Op0HOp1H->setFlag(MachineInstr::MIFlag::NoFPExcept);
- }
- if (I.getFlag(MachineInstr::MIFlag::FmContract)) {
- Op0LOp1L->setFlag(MachineInstr::MIFlag::FmContract);
- Op0HOp1H->setFlag(MachineInstr::MIFlag::FmContract);
- }
-
+ uint32_t IFlags = I.getFlags();
+ Op0LOp1L->setFlags(IFlags);
+ Op0HOp1H->setFlags(IFlags);
LoDstOp.setIsRenamable(DstOp.isRenamable());
HiDstOp.setIsRenamable(DstOp.isRenamable());
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 7cfd059..6500fce 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -964,14 +964,12 @@ class MAIFrag<SDPatternOperator Op, bit HasAbid = true, bit Scaled = false> : Pa
class CanUseAGPR_MAI<ValueType vt> {
code PredicateCode = [{
return !Subtarget->hasGFX90AInsts() ||
- (!SIMachineFunctionInfo::MFMAVGPRForm &&
- MF->getInfo<SIMachineFunctionInfo>()->getMinNumAGPRs() >=
- }] # !srl(vt.Size, 5) # ");";
+ MF->getInfo<SIMachineFunctionInfo>()->selectAGPRFormMFMA(
+ }] # !srl(vt.Size, 5) # ");";
code GISelPredicateCode = [{
return !Subtarget->hasGFX90AInsts() ||
- (!SIMachineFunctionInfo::MFMAVGPRForm &&
- MF.getInfo<SIMachineFunctionInfo>()->getMinNumAGPRs() >=
+ MF.getInfo<SIMachineFunctionInfo>()->selectAGPRFormMFMA(
}] # !srl(vt.Size, 5) # ");";
}
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 2a40fb9..83c7def 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -42,7 +42,6 @@
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/ComplexDeinterleavingPass.h"
#include "llvm/CodeGen/ISDOpcodes.h"
-#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
diff --git a/llvm/lib/Target/Hexagon/HexagonPatterns.td b/llvm/lib/Target/Hexagon/HexagonPatterns.td
index a0acfcf..85ce944 100644
--- a/llvm/lib/Target/Hexagon/HexagonPatterns.td
+++ b/llvm/lib/Target/Hexagon/HexagonPatterns.td
@@ -699,35 +699,20 @@ def: OpR_RR_pat<C2_cmpgtp, setgt, i1, I64>;
def: OpR_RR_pat<C2_cmpgtup, setugt, i1, I64>;
def: OpR_RR_pat<C2_cmpgtp, RevCmp<setlt>, i1, I64>;
def: OpR_RR_pat<C2_cmpgtup, RevCmp<setult>, i1, I64>;
-def: OpR_RR_pat<A2_vcmpbeq, seteq, i1, V8I8>;
def: OpR_RR_pat<A2_vcmpbeq, seteq, v8i1, V8I8>;
-def: OpR_RR_pat<A4_vcmpbgt, RevCmp<setlt>, i1, V8I8>;
def: OpR_RR_pat<A4_vcmpbgt, RevCmp<setlt>, v8i1, V8I8>;
-def: OpR_RR_pat<A4_vcmpbgt, setgt, i1, V8I8>;
def: OpR_RR_pat<A4_vcmpbgt, setgt, v8i1, V8I8>;
-def: OpR_RR_pat<A2_vcmpbgtu, RevCmp<setult>, i1, V8I8>;
def: OpR_RR_pat<A2_vcmpbgtu, RevCmp<setult>, v8i1, V8I8>;
-def: OpR_RR_pat<A2_vcmpbgtu, setugt, i1, V8I8>;
def: OpR_RR_pat<A2_vcmpbgtu, setugt, v8i1, V8I8>;
-def: OpR_RR_pat<A2_vcmpheq, seteq, i1, V4I16>;
def: OpR_RR_pat<A2_vcmpheq, seteq, v4i1, V4I16>;
-def: OpR_RR_pat<A2_vcmphgt, RevCmp<setlt>, i1, V4I16>;
def: OpR_RR_pat<A2_vcmphgt, RevCmp<setlt>, v4i1, V4I16>;
-def: OpR_RR_pat<A2_vcmphgt, setgt, i1, V4I16>;
def: OpR_RR_pat<A2_vcmphgt, setgt, v4i1, V4I16>;
-def: OpR_RR_pat<A2_vcmphgtu, RevCmp<setult>, i1, V4I16>;
def: OpR_RR_pat<A2_vcmphgtu, RevCmp<setult>, v4i1, V4I16>;
-def: OpR_RR_pat<A2_vcmphgtu, setugt, i1, V4I16>;
def: OpR_RR_pat<A2_vcmphgtu, setugt, v4i1, V4I16>;
-def: OpR_RR_pat<A2_vcmpweq, seteq, i1, V2I32>;
def: OpR_RR_pat<A2_vcmpweq, seteq, v2i1, V2I32>;
-def: OpR_RR_pat<A2_vcmpwgt, RevCmp<setlt>, i1, V2I32>;
def: OpR_RR_pat<A2_vcmpwgt, RevCmp<setlt>, v2i1, V2I32>;
-def: OpR_RR_pat<A2_vcmpwgt, setgt, i1, V2I32>;
def: OpR_RR_pat<A2_vcmpwgt, setgt, v2i1, V2I32>;
-def: OpR_RR_pat<A2_vcmpwgtu, RevCmp<setult>, i1, V2I32>;
def: OpR_RR_pat<A2_vcmpwgtu, RevCmp<setult>, v2i1, V2I32>;
-def: OpR_RR_pat<A2_vcmpwgtu, setugt, i1, V2I32>;
def: OpR_RR_pat<A2_vcmpwgtu, setugt, v2i1, V2I32>;
def: OpR_RR_pat<F2_sfcmpeq, seteq, i1, F32>;
@@ -1213,12 +1198,6 @@ def: OpR_RI_pat<S2_asl_i_r, Shl, i32, I32, u5_0ImmPred>;
def: OpR_RI_pat<S2_asr_i_p, Sra, i64, I64, u6_0ImmPred>;
def: OpR_RI_pat<S2_lsr_i_p, Srl, i64, I64, u6_0ImmPred>;
def: OpR_RI_pat<S2_asl_i_p, Shl, i64, I64, u6_0ImmPred>;
-def: OpR_RI_pat<S2_asr_i_vh, Sra, v4i16, V4I16, u4_0ImmPred>;
-def: OpR_RI_pat<S2_lsr_i_vh, Srl, v4i16, V4I16, u4_0ImmPred>;
-def: OpR_RI_pat<S2_asl_i_vh, Shl, v4i16, V4I16, u4_0ImmPred>;
-def: OpR_RI_pat<S2_asr_i_vh, Sra, v2i32, V2I32, u5_0ImmPred>;
-def: OpR_RI_pat<S2_lsr_i_vh, Srl, v2i32, V2I32, u5_0ImmPred>;
-def: OpR_RI_pat<S2_asl_i_vh, Shl, v2i32, V2I32, u5_0ImmPred>;
def: OpR_RR_pat<S2_asr_r_r, Sra, i32, I32, I32>;
def: OpR_RR_pat<S2_lsr_r_r, Srl, i32, I32, I32>;
diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index ba70c9e..97379d7 100644
--- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -3677,7 +3677,7 @@ bool MipsAsmParser::expandBranchImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
Out, STI))
return true;
- if (IsLikely) {
+ if (IsLikely && MemOffsetOp.isExpr()) {
TOut.emitRRX(OpCode, DstRegOp.getReg(), ATReg,
MCOperand::createExpr(MemOffsetOp.getExpr()), IDLoc, STI);
TOut.emitRRI(Mips::SLL, Mips::ZERO, Mips::ZERO, 0, IDLoc, STI);
diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.td b/llvm/lib/Target/Mips/MipsInstrInfo.td
index eff80e5..21d8ded 100644
--- a/llvm/lib/Target/Mips/MipsInstrInfo.td
+++ b/llvm/lib/Target/Mips/MipsInstrInfo.td
@@ -855,6 +855,16 @@ def calltarget : Operand<iPTR> {
def imm64: Operand<i64>;
+def ConstantImmAsmOperandClass : AsmOperandClass {
+ let Name = "ConstantImm";
+ let PredicateMethod = "isConstantImm";
+ let RenderMethod = "addImmOperands";
+}
+
+def ConstantImm64: Operand<i64> {
+ let ParserMatchClass = ConstantImmAsmOperandClass;
+}
+
def simm19_lsl2 : Operand<i32> {
let EncoderMethod = "getSimm19Lsl2Encoding";
let DecoderMethod = "DecodeSimm19Lsl2";
@@ -2947,10 +2957,10 @@ def : MipsInstAlias<"nor\t$rs, $imm", (NORImm GPR32Opnd:$rs, GPR32Opnd:$rs,
let hasDelaySlot = 1, isCTI = 1 in {
def BneImm : MipsAsmPseudoInst<(outs GPR32Opnd:$rt),
- (ins imm64:$imm64, brtarget:$offset),
+ (ins ConstantImm64:$imm64, brtarget:$offset),
"bne\t$rt, $imm64, $offset">;
def BeqImm : MipsAsmPseudoInst<(outs GPR32Opnd:$rt),
- (ins imm64:$imm64, brtarget:$offset),
+ (ins ConstantImm64:$imm64, brtarget:$offset),
"beq\t$rt, $imm64, $offset">;
class CondBranchPseudo<string instr_asm> :
@@ -2978,7 +2988,7 @@ def BGTUL: CondBranchPseudo<"bgtul">, ISA_MIPS2_NOT_32R6_64R6;
let isCTI = 1 in
class CondBranchImmPseudo<string instr_asm> :
- MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, imm64:$imm, brtarget:$offset),
+ MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, ConstantImm64:$imm, brtarget:$offset),
!strconcat(instr_asm, "\t$rs, $imm, $offset")>;
def BEQLImmMacro : CondBranchImmPseudo<"beql">, ISA_MIPS2_NOT_32R6_64R6;
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 40c05e8..5ceb477 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1520,6 +1520,8 @@ def HasVendorXqcics
: Predicate<"Subtarget->hasVendorXqcics()">,
AssemblerPredicate<(all_of FeatureVendorXqcics),
"'Xqcics' (Qualcomm uC Conditional Select Extension)">;
+def NoVendorXqcics
+ : Predicate<"!Subtarget->hasVendorXqcics()">;
def FeatureVendorXqcicsr
: RISCVExperimentalExtension<0, 4, "Qualcomm uC CSR Extension">;
@@ -1823,6 +1825,11 @@ def TuneConditionalCompressedMoveFusion
def HasConditionalMoveFusion : Predicate<"Subtarget->hasConditionalMoveFusion()">;
def NoConditionalMoveFusion : Predicate<"!Subtarget->hasConditionalMoveFusion()">;
+def TuneHasSingleElementVecFP64
+ : SubtargetFeature<"single-element-vec-fp64", "HasSingleElementVectorFP64", "true",
+ "Certain vector FP64 operations produce a single result "
+ "element per cycle">;
+
def TuneMIPSP8700
: SubtargetFeature<"mips-p8700", "RISCVProcFamily", "MIPSP8700",
"MIPS p8700 processor">;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
index f2724c41..5e1d07a 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
@@ -1571,35 +1571,42 @@ def : QCIMVCCIPat<SETUGE, QC_MVGEUI, uimm5nonzero>;
}
let Predicates = [HasVendorXqcicli, IsRV32] in {
-def : QCILICCPat<SETEQ, QC_LIEQ>;
-def : QCILICCPat<SETNE, QC_LINE>;
def : QCILICCPat<SETLT, QC_LILT>;
def : QCILICCPat<SETGE, QC_LIGE>;
def : QCILICCPat<SETULT, QC_LILTU>;
def : QCILICCPat<SETUGE, QC_LIGEU>;
-def : QCILICCIPat<SETEQ, QC_LIEQI, simm5>;
-def : QCILICCIPat<SETNE, QC_LINEI, simm5>;
def : QCILICCIPat<SETLT, QC_LILTI, simm5>;
def : QCILICCIPat<SETGE, QC_LIGEI, simm5>;
def : QCILICCIPat<SETULT, QC_LILTUI, uimm5>;
def : QCILICCIPat<SETUGE, QC_LIGEUI, uimm5>;
-def : QCILICCPatInv<SETNE, QC_LIEQ>;
-def : QCILICCPatInv<SETEQ, QC_LINE>;
def : QCILICCPatInv<SETGE, QC_LILT>;
def : QCILICCPatInv<SETLT, QC_LIGE>;
def : QCILICCPatInv<SETUGE, QC_LILTU>;
def : QCILICCPatInv<SETULT, QC_LIGEU>;
-def : QCILICCIPatInv<SETNE, QC_LIEQI, simm5>;
-def : QCILICCIPatInv<SETEQ, QC_LINEI, simm5>;
def : QCILICCIPatInv<SETGE, QC_LILTI, simm5>;
def : QCILICCIPatInv<SETLT, QC_LIGEI, simm5>;
def : QCILICCIPatInv<SETUGE, QC_LILTUI, uimm5>;
def : QCILICCIPatInv<SETULT, QC_LIGEUI, uimm5>;
} // Predicates = [HasVendorXqcicli, IsRV32]
+// Prioritize Xqcics over these patterns.
+let Predicates = [HasVendorXqcicli, NoVendorXqcics, IsRV32] in {
+def : QCILICCPat<SETEQ, QC_LIEQ>;
+def : QCILICCPat<SETNE, QC_LINE>;
+
+def : QCILICCIPat<SETEQ, QC_LIEQI, simm5>;
+def : QCILICCIPat<SETNE, QC_LINEI, simm5>;
+
+def : QCILICCPatInv<SETNE, QC_LIEQ>;
+def : QCILICCPatInv<SETEQ, QC_LINE>;
+
+def : QCILICCIPatInv<SETNE, QC_LIEQI, simm5>;
+def : QCILICCIPatInv<SETEQ, QC_LINEI, simm5>;
+} // Predicates = [HasVendorXqcicli, NoVendorXqcics, IsRV32]
+
let Predicates = [HasVendorXqcics, IsRV32] in {
// (SELECT X, Y, Z) is canonicalised to `(riscv_selectcc x, 0, NE, y, z)`.
// These exist to prioritise over the `Select_GPR_Using_CC_GPR` pattern.
diff --git a/llvm/lib/Target/RISCV/RISCVInstrPredicates.td b/llvm/lib/Target/RISCV/RISCVInstrPredicates.td
index 6d86aff..3658817 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrPredicates.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrPredicates.td
@@ -14,6 +14,10 @@
// otherwise.
def VLDSX0Pred : MCSchedPredicate<CheckRegOperand<3, X0>>;
+// This scheduling predicate is true when subtarget feature TuneHasSingleElementVecFP64
+// is enabled.
+def SingleElementVecFP64SchedPred : FeatureSchedPredicate<TuneHasSingleElementVecFP64>;
+
// Returns true if this is the sext.w pattern, addiw rd, rs1, 0.
def isSEXT_W
: TIIPredicate<"isSEXT_W",
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index 17a7948..e86431f 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -338,7 +338,8 @@ def SIFIVE_X390 : RISCVProcessorModel<"sifive-x390",
FeatureStdExtZvl1024b,
FeatureVendorXSiFivecdiscarddlone,
FeatureVendorXSiFivecflushdlone],
- SiFiveIntelligenceTuneFeatures>;
+ !listconcat(SiFiveIntelligenceTuneFeatures,
+ [TuneHasSingleElementVecFP64])>;
defvar SiFiveP400TuneFeatures = [TuneNoDefaultUnroll,
TuneConditionalCompressedMoveFusion,
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
index 3e07eff..f863392a 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
@@ -317,7 +317,6 @@ multiclass SiFive7WriteResBase<int VLEN,
ProcResourceKind VL, ProcResourceKind VS,
ProcResourceKind VCQ,
SiFive7FPLatencies fpLatencies,
- bit isFP64Throttled = false,
bit hasFastGather = false> {
// Branching
@@ -832,29 +831,56 @@ multiclass SiFive7WriteResBase<int VLEN,
// 13. Vector Floating-Point Instructions
foreach mx = SchedMxListF in {
foreach sew = SchedSEWSet<mx, isF=1>.val in {
- defvar Cycles = !if(!and(isFP64Throttled, !eq(sew, 64)),
- SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c,
- SiFive7GetCyclesDefault<mx>.c);
- defvar Lat8 = !if(!and(isFP64Throttled, !eq(sew, 64)), Cycles, 8);
- defvar VA = !if(!and(isFP64Throttled, !eq(sew, 64)), VA1, VA1OrVA2);
defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
- let Latency = Lat8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
- defm : LMULSEWWriteResMXSEW<"WriteVFALUV", [VCQ, VA], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVFALUF", [VCQ, VA], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVFMulV", [VCQ, VA], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVFMulF", [VCQ, VA], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [VCQ, VA], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [VCQ, VA], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVFRecpV", [VCQ, VA1], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
- }
- defvar Lat4 = !if(!and(isFP64Throttled, !eq(sew, 64)), Cycles, 4);
- let Latency = Lat4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
- defm : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [VCQ, VA], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [VCQ, VA], mx, sew, IsWorstCase>;
- // min max require merge
- defm : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [VCQ, VA1], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [VCQ, VA1], mx, sew, IsWorstCase>;
+ if !eq(sew, 64) then {
+ defvar SingleElementCycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c;
+ foreach SchedWriteName = ["WriteVFALUV", "WriteVFALUF", "WriteVFMulV", "WriteVFMulF",
+ "WriteVFMulAddV", "WriteVFMulAddF"] in
+ defm : LMULSEWWriteResMXSEWVariant<SchedWriteName, SingleElementVecFP64SchedPred,
+ // Predicated
+ [VCQ, VA1], !add(SingleElementCycles, 7), [0, 1], [1, !add(1, SingleElementCycles)],
+ // Not Predicated
+ [VCQ, VA1OrVA2], 8, [0, 1], [1, !add(1, SiFive7GetCyclesDefault<mx>.c)],
+ mx, sew, IsWorstCase>;
+ foreach SchedWriteName = ["WriteVFRecpV", "WriteVFCvtIToFV"] in
+ defm : LMULSEWWriteResMXSEWVariant<SchedWriteName, SingleElementVecFP64SchedPred,
+ // Predicated
+ [VCQ, VA1], !add(SingleElementCycles, 7), [0, 1], [1, !add(1, SingleElementCycles)],
+ // Not Predicated
+ [VCQ, VA1], 8, [0, 1], [1, !add(1, SiFive7GetCyclesDefault<mx>.c)],
+ mx, sew, IsWorstCase>;
+ foreach SchedWriteName = ["WriteVFSgnjV", "WriteVFSgnjF"] in
+ defm : LMULSEWWriteResMXSEWVariant<SchedWriteName, SingleElementVecFP64SchedPred,
+ // Predicated
+ [VCQ, VA1], !add(SingleElementCycles, 3), [0, 1], [1, !add(1, SingleElementCycles)],
+ // Not Predicated
+ [VCQ, VA1OrVA2], 4, [0, 1], [1, !add(1, SiFive7GetCyclesDefault<mx>.c)],
+ mx, sew, IsWorstCase>;
+ foreach SchedWriteName = ["WriteVFMinMaxV", "WriteVFMinMaxF"] in
+ defm : LMULSEWWriteResMXSEWVariant<SchedWriteName, SingleElementVecFP64SchedPred,
+ // Predicated
+ [VCQ, VA1], !add(SingleElementCycles, 3), [0, 1], [1, !add(1, SingleElementCycles)],
+ // Not Predicated
+ [VCQ, VA1], 4, [0, 1], [1, !add(1, SiFive7GetCyclesDefault<mx>.c)],
+ mx, sew, IsWorstCase>;
+ } else {
+ let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, SiFive7GetCyclesDefault<mx>.c)] in {
+ defm : LMULSEWWriteResMXSEW<"WriteVFALUV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
+ defm : LMULSEWWriteResMXSEW<"WriteVFALUF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
+ defm : LMULSEWWriteResMXSEW<"WriteVFMulV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
+ defm : LMULSEWWriteResMXSEW<"WriteVFMulF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
+ defm : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
+ defm : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
+ defm : LMULSEWWriteResMXSEW<"WriteVFRecpV", [VCQ, VA1], mx, sew, IsWorstCase>;
+ defm : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
+ }
+ let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, SiFive7GetCyclesDefault<mx>.c)] in {
+ defm : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
+ defm : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
+ // min max require merge
+ defm : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [VCQ, VA1], mx, sew, IsWorstCase>;
+ defm : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [VCQ, VA1], mx, sew, IsWorstCase>;
+ }
}
}
}
@@ -892,19 +918,28 @@ multiclass SiFive7WriteResBase<int VLEN,
// Widening
foreach mx = SchedMxListW in {
foreach sew = SchedSEWSet<mx, isF=0, isWidening=1>.val in {
- defvar Cycles = !if(!and(isFP64Throttled, !eq(sew, 32)),
- SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c,
- SiFive7GetCyclesDefault<mx>.c);
defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListW>.c;
- let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in
- defm : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
+ defvar DefaultCycles = SiFive7GetCyclesDefault<mx>.c;
+ if !eq(sew, 32) then {
+ defvar SingleElementCycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c;
+ defm : LMULSEWWriteResMXSEWVariant<"WriteVFWCvtIToFV", SingleElementVecFP64SchedPred,
+ // Predicated
+ [VCQ, VA1], 8, [0, 1], [1, !add(1, SingleElementCycles)],
+ // Not Predicated
+ [VCQ, VA1], 8, [0, 1], [1, !add(1, DefaultCycles)],
+ mx, sew, IsWorstCase>;
+ } else {
+ let Latency = 8,
+ AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, DefaultCycles)] in
+ defm : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
+ }
}
}
foreach mx = SchedMxListFW in {
foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in {
- defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
+ defvar DefaultCycles = SiFive7GetCyclesDefault<mx>.c;
defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
- let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+ let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, DefaultCycles)] in {
defm : LMULSEWWriteResMXSEW<"WriteVFWALUV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
defm : LMULSEWWriteResMXSEW<"WriteVFWALUF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
defm : LMULSEWWriteResMXSEW<"WriteVFWMulV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
@@ -912,11 +947,19 @@ multiclass SiFive7WriteResBase<int VLEN,
defm : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
defm : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
}
- defvar CvtCycles = !if(!and(isFP64Throttled, !eq(sew, 32)),
- SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c,
- SiFive7GetCyclesDefault<mx>.c);
- let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, CvtCycles)] in
- defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
+ if !eq(sew, 32) then {
+ defvar SingleElementCycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c;
+ defm : LMULSEWWriteResMXSEWVariant<"WriteVFWCvtFToFV", SingleElementVecFP64SchedPred,
+ // Predicated
+ [VCQ, VA1], 8, [0, 1], [1, !add(1, SingleElementCycles)],
+ // Not Predicated
+ [VCQ, VA1], 8, [0, 1], [1, !add(1, DefaultCycles)],
+ mx, sew, IsWorstCase>;
+ } else {
+ let Latency = 8,
+ AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, DefaultCycles)] in
+ defm : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
+ }
}
defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxListFW>.c;
@@ -933,13 +976,23 @@ multiclass SiFive7WriteResBase<int VLEN,
}
foreach mx = SchedMxListFW in {
foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in {
- defvar Cycles = !if(!and(isFP64Throttled, !eq(sew, 32)),
- SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c,
- SiFive7GetCyclesNarrowing<mx>.c);
defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
- let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
- defm : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
+ defvar DefaultCycles = SiFive7GetCyclesNarrowing<mx>.c;
+ if !eq(sew, 32) then {
+ defvar SingleElementCycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c;
+ foreach SchedWriteName = ["WriteVFNCvtIToFV", "WriteVFNCvtFToFV"] in
+ defm : LMULSEWWriteResMXSEWVariant<SchedWriteName, SingleElementVecFP64SchedPred,
+ // Predicated
+ [VCQ, VA1], 8, [0, 1], [1, !add(1, SingleElementCycles)],
+ // Not Predicated
+ [VCQ, VA1], 8, [0, 1], [1, !add(1, DefaultCycles)],
+ mx, sew, IsWorstCase>;
+ } else {
+ let Latency = 8,
+ AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, DefaultCycles)] in {
+ defm : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
+ defm : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
+ }
}
}
}
@@ -1499,7 +1552,6 @@ multiclass SiFive7ReadAdvance {
/// eventually be supplied by different SchedMachineModels.
multiclass SiFive7SchedResources<int vlen, bit extraVALU,
SiFive7FPLatencies fpLatencies,
- bit isFP64Throttled,
bit hasFastGather> {
defm SiFive7 : SiFive7ProcResources<extraVALU>;
@@ -1527,8 +1579,7 @@ multiclass SiFive7SchedResources<int vlen, bit extraVALU,
: SiFive7WriteResBase<vlen, SiFive7PipeA, SiFive7PipeB, SiFive7PipeAB,
SiFive7IDiv, SiFive7FDiv, SiFive7VA1,
SiFive7VA1OrVA2, SiFive7VL, SiFive7VS,
- SiFive7VCQ, fpLatencies, isFP64Throttled,
- hasFastGather>;
+ SiFive7VCQ, fpLatencies, hasFastGather>;
//===----------------------------------------------------------------------===//
// Bypass and advance
@@ -1560,7 +1611,6 @@ class SiFive7SchedMachineModel<int vlen> : SchedMachineModel {
bit HasExtraVALU = false;
SiFive7FPLatencies FPLatencies;
- bit IsFP64Throttled = false;
bit HasFastGather = false;
string Name = !subst("Model", "", !subst("SiFive7", "", NAME));
@@ -1587,7 +1637,6 @@ def SiFive7VLEN512Model : SiFive7SchedMachineModel<512> {
def SiFive7VLEN1024X300Model : SiFive7SchedMachineModel<1024> {
let HasExtraVALU = true;
let FPLatencies = SiFive7LowFPLatencies;
- let IsFP64Throttled = true;
let HasFastGather = true;
}
@@ -1596,7 +1645,6 @@ foreach model = [SiFive7VLEN512Model, SiFive7VLEN1024X300Model] in {
let SchedModel = model in
defm model.Name : SiFive7SchedResources<model.VLEN, model.HasExtraVALU,
model.FPLatencies,
- model.IsFP64Throttled,
model.HasFastGather>;
}
diff --git a/llvm/lib/Target/RISCV/RISCVScheduleV.td b/llvm/lib/Target/RISCV/RISCVScheduleV.td
index 01a4308..d11b446 100644
--- a/llvm/lib/Target/RISCV/RISCVScheduleV.td
+++ b/llvm/lib/Target/RISCV/RISCVScheduleV.td
@@ -128,6 +128,22 @@ multiclass LMULWriteResMXVariant<string name, SchedPredicateBase Pred,
IsWorstCase>;
}
+multiclass LMULSEWWriteResMXSEWVariant<string name, SchedPredicateBase Pred,
+ list<ProcResourceKind> predResources,
+ int predLat, list<int> predAcquireCycles,
+ list<int> predReleaseCycles,
+ list<ProcResourceKind> noPredResources,
+ int noPredLat, list<int> noPredAcquireCycles,
+ list<int> noPredReleaseCycles,
+ string mx, int sew, bit IsWorstCase> {
+ defm "" : LMULWriteResVariantImpl<name, name # "_" # mx # "_E" # sew, Pred, predResources,
+ predLat, predAcquireCycles,
+ predReleaseCycles, noPredResources,
+ noPredLat, noPredAcquireCycles,
+ noPredReleaseCycles,
+ IsWorstCase>;
+}
+
// Define multiclasses to define SchedWrite, SchedRead, WriteRes, and
// ReadAdvance for each (name, LMUL) pair and for each LMUL in each of the
// SchedMxList variants above. Each multiclass is responsible for defining
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp
index e8c849e..28a1690 100644
--- a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp
@@ -46,7 +46,6 @@
#include "SPIRVSubtarget.h"
#include "SPIRVTargetMachine.h"
#include "SPIRVUtils.h"
-#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
diff --git a/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp b/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp
index 20f03b0..60d39c9 100644
--- a/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp
@@ -19,7 +19,6 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
diff --git a/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp
index 278ad7c..e621bcd44 100644
--- a/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp
@@ -14,7 +14,6 @@
#include "SPIRV.h"
#include "SPIRVSubtarget.h"
#include "SPIRVUtils.h"
-#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/Transforms/Utils/Cloning.h"
diff --git a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp
index 1811492..5b149f8 100644
--- a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp
@@ -16,7 +16,6 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9580ade..1cfcb1f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -28,7 +28,6 @@
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index 3bc46af..6dd43b2 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -547,7 +547,7 @@ unsigned X86TargetLowering::getAddressSpace() const {
static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
- (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
+ TargetTriple.isAndroid();
}
static Constant* SegmentOffset(IRBuilderBase &IRB,