Diffstat (limited to 'llvm/lib/Target')
123 files changed, 3500 insertions, 3050 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 12fc976..201bfe0 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -1205,32 +1205,36 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, Register DstReg = MI.getOperand(0).getReg(); if (DstReg == MI.getOperand(3).getReg()) { // Expand to BIT - BuildMI(MBB, MBBI, MI.getDebugLoc(), - TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BITv8i8 - : AArch64::BITv16i8)) - .add(MI.getOperand(0)) - .add(MI.getOperand(3)) - .add(MI.getOperand(2)) - .add(MI.getOperand(1)); + auto I = BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BITv8i8 + : AArch64::BITv16i8)) + .add(MI.getOperand(0)) + .add(MI.getOperand(3)) + .add(MI.getOperand(2)) + .add(MI.getOperand(1)); + transferImpOps(MI, I, I); } else if (DstReg == MI.getOperand(2).getReg()) { // Expand to BIF - BuildMI(MBB, MBBI, MI.getDebugLoc(), - TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BIFv8i8 - : AArch64::BIFv16i8)) - .add(MI.getOperand(0)) - .add(MI.getOperand(2)) - .add(MI.getOperand(3)) - .add(MI.getOperand(1)); + auto I = BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BIFv8i8 + : AArch64::BIFv16i8)) + .add(MI.getOperand(0)) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)) + .add(MI.getOperand(1)); + transferImpOps(MI, I, I); } else { // Expand to BSL, use additional move if required if (DstReg == MI.getOperand(1).getReg()) { - BuildMI(MBB, MBBI, MI.getDebugLoc(), - TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8 - : AArch64::BSLv16i8)) - .add(MI.getOperand(0)) - .add(MI.getOperand(1)) - .add(MI.getOperand(2)) - .add(MI.getOperand(3)); + auto I = + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8 + : AArch64::BSLv16i8)) + .add(MI.getOperand(0)) + .add(MI.getOperand(1)) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)); + transferImpOps(MI, I, I); } else { BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::ORRv8i8 @@ -1240,15 +1244,17 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, getRenamableRegState(MI.getOperand(0).isRenamable())) .add(MI.getOperand(1)) .add(MI.getOperand(1)); - BuildMI(MBB, MBBI, MI.getDebugLoc(), - TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8 - : AArch64::BSLv16i8)) - .add(MI.getOperand(0)) - .addReg(DstReg, - RegState::Kill | - getRenamableRegState(MI.getOperand(0).isRenamable())) - .add(MI.getOperand(2)) - .add(MI.getOperand(3)); + auto I2 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8 + : AArch64::BSLv16i8)) + .add(MI.getOperand(0)) + .addReg(DstReg, + RegState::Kill | getRenamableRegState( + MI.getOperand(0).isRenamable())) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)); + transferImpOps(MI, I2, I2); } } MI.eraseFromParent(); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index f026726..ef3e8c8 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -164,6 +164,9 @@ static cl::opt<bool> UseFEATCPACodegen( /// Value type used for condition codes. static const MVT MVT_CC = MVT::i32; +/// Value type used for NZCV flags. 
+static constexpr MVT FlagsVT = MVT::i32; + static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2, AArch64::X3, AArch64::X4, AArch64::X5, AArch64::X6, AArch64::X7}; @@ -3451,7 +3454,7 @@ static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &DL, } unsigned Opcode = IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP; - return DAG.getNode(Opcode, DL, {MVT::i32, MVT::Other}, {Chain, LHS, RHS}); + return DAG.getNode(Opcode, DL, {FlagsVT, MVT::Other}, {Chain, LHS, RHS}); } static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, @@ -3465,7 +3468,7 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS); RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS); } - return DAG.getNode(AArch64ISD::FCMP, DL, MVT::i32, LHS, RHS); + return DAG.getNode(AArch64ISD::FCMP, DL, FlagsVT, LHS, RHS); } // The CMP instruction is just an alias for SUBS, and representing it as @@ -3490,7 +3493,7 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, // (a.k.a. ANDS) except that the flags are only guaranteed to work for one // of the signed comparisons. const SDValue ANDSNode = - DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(VT, MVT_CC), + DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(VT, FlagsVT), LHS.getOperand(0), LHS.getOperand(1)); // Replace all users of (and X, Y) with newly generated (ands X, Y) DAG.ReplaceAllUsesWith(LHS, ANDSNode); @@ -3501,7 +3504,7 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, } } - return DAG.getNode(Opcode, DL, DAG.getVTList(VT, MVT_CC), LHS, RHS) + return DAG.getNode(Opcode, DL, DAG.getVTList(VT, FlagsVT), LHS, RHS) .getValue(1); } @@ -3597,7 +3600,7 @@ static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC); unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC); SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32); - return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp); + return DAG.getNode(Opcode, DL, FlagsVT, LHS, RHS, NZCVOp, Condition, CCOp); } /// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be @@ -4036,7 +4039,7 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul); // Check that the result fits into a 32-bit integer. - SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC); + SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT); if (IsSigned) { // cmp xreg, wreg, sxtw SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value); @@ -4059,12 +4062,12 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { DAG.getConstant(63, DL, MVT::i64)); // It is important that LowerBits is last, otherwise the arithmetic // shift will not be folded into the compare (SUBS). - SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); + SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT); Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) .getValue(1); } else { SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS); - SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); + SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT); Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, DAG.getConstant(0, DL, MVT::i64), @@ -4075,7 +4078,7 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { } // switch (...) 
if (Opc) { - SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32); + SDVTList VTs = DAG.getVTList(Op->getValueType(0), FlagsVT); // Emit the AArch64 operation with overflow check. Value = DAG.getNode(Opc, DL, VTs, LHS, RHS); @@ -4177,7 +4180,7 @@ static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) { SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value; SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT); SDValue Cmp = - DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::Glue), Op0, Op1); + DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT), Op0, Op1); return Cmp.getValue(1); } @@ -4220,16 +4223,15 @@ static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG, SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry); SDLoc DL(Op); - SDVTList VTs = DAG.getVTList(VT0, VT1); - SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::Glue), OpLHS, + SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, FlagsVT), OpLHS, OpRHS, OpCarryIn); SDValue OutFlag = IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG) : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry); - return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Sum, OutFlag); + return DAG.getMergeValues({Sum, OutFlag}, DL); } static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { @@ -4254,8 +4256,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { Overflow = DAG.getNode(AArch64ISD::CSEL, DL, MVT::i32, FVal, TVal, CCVal, Overflow); - SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); - return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Value, Overflow); + return DAG.getMergeValues({Value, Overflow}, DL); } // Prefetch operands are: @@ -7037,9 +7038,8 @@ SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const { SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op.getOperand(0)); // Generate SUBS & CSEL. - SDValue Cmp = - DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32), - Op.getOperand(0), DAG.getConstant(0, DL, VT)); + SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT), + Op.getOperand(0), DAG.getConstant(0, DL, VT)); return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg, DAG.getConstant(AArch64CC::PL, DL, MVT::i32), Cmp.getValue(1)); @@ -11108,7 +11108,7 @@ SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op, SDValue Carry = Op.getOperand(2); // SBCS uses a carry not a borrow so the carry flag should be inverted first. SDValue InvCarry = valueToCarryFlag(Carry, DAG, true); - SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, MVT::Glue), + SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, FlagsVT), LHS, RHS, InvCarry); EVT OpVT = Op.getValueType(); @@ -12441,10 +12441,10 @@ SDValue AArch64TargetLowering::LowerAsmOutputForConstraint( // Get NZCV register. Only update chain when copyfrom is glued. if (Glue.getNode()) { - Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32, Glue); + Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, FlagsVT, Glue); Chain = Glue.getValue(1); } else - Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32); + Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, FlagsVT); // Extract CC code. 
SDValue CC = getSETCC(Cond, Glue, DL, DAG); @@ -17343,12 +17343,17 @@ bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) { /// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35> /// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19> /// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr) -bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, +bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store, + Value *LaneMask, ShuffleVectorInst *SVI, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); + auto *SI = dyn_cast<StoreInst>(Store); + if (!SI) + return false; + assert(!LaneMask && "Unexpected mask on store"); auto *VecTy = cast<FixedVectorType>(SVI->getType()); assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store"); @@ -18015,11 +18020,14 @@ bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask( unsigned ShlAmt = C2->getZExtValue(); if (auto ShouldADD = *N->user_begin(); ShouldADD->getOpcode() == ISD::ADD && ShouldADD->hasOneUse()) { - if (auto ShouldLOAD = dyn_cast<LoadSDNode>(*ShouldADD->user_begin())) { - unsigned ByteVT = ShouldLOAD->getMemoryVT().getSizeInBits() / 8; - if ((1ULL << ShlAmt) == ByteVT && - isIndexedLoadLegal(ISD::PRE_INC, ShouldLOAD->getMemoryVT())) - return false; + if (auto Load = dyn_cast<LoadSDNode>(*ShouldADD->user_begin())) { + EVT MemVT = Load->getMemoryVT(); + + if (Load->getValueType(0).isScalableVector()) + return (8ULL << ShlAmt) != MemVT.getScalarSizeInBits(); + + if (isIndexedLoadLegal(ISD::PRE_INC, MemVT)) + return (8ULL << ShlAmt) != MemVT.getFixedSizeInBits(); } } } @@ -18588,7 +18596,7 @@ AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor, Created.push_back(And.getNode()); } else { SDValue CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC); - SDVTList VTs = DAG.getVTList(VT, MVT::i32); + SDVTList VTs = DAG.getVTList(VT, FlagsVT); SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0); SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne); @@ -19477,10 +19485,10 @@ static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) { // can select to CCMN to avoid the extra mov SDValue AbsOp1 = DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0)); - CCmp = DAG.getNode(AArch64ISD::CCMN, DL, MVT_CC, Cmp1.getOperand(0), AbsOp1, - NZCVOp, Condition, Cmp0); + CCmp = DAG.getNode(AArch64ISD::CCMN, DL, FlagsVT, Cmp1.getOperand(0), + AbsOp1, NZCVOp, Condition, Cmp0); } else { - CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0), + CCmp = DAG.getNode(AArch64ISD::CCMP, DL, FlagsVT, Cmp1.getOperand(0), Cmp1.getOperand(1), NZCVOp, Condition, Cmp0); } return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0), @@ -25129,8 +25137,9 @@ static SDValue reassociateCSELOperandsForCSE(SDNode *N, SelectionDAG &DAG) { if (!TReassocOp && !FReassocOp) return SDValue(); - SDValue NewCmp = DAG.getNode(AArch64ISD::SUBS, SDLoc(SubsNode), - DAG.getVTList(VT, MVT_CC), CmpOpOther, SubsOp); + SDValue NewCmp = + DAG.getNode(AArch64ISD::SUBS, SDLoc(SubsNode), + DAG.getVTList(VT, FlagsVT), CmpOpOther, SubsOp); auto Reassociate = [&](SDValue ReassocOp, unsigned OpNum) { if (!ReassocOp) @@ -27156,7 +27165,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, : AArch64SysReg::RNDRRS); SDLoc DL(N); SDValue A = DAG.getNode( - AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::i32, MVT::Other), + AArch64ISD::MRS, 
DL, DAG.getVTList(MVT::i64, FlagsVT, MVT::Other), N->getOperand(0), DAG.getConstant(Register, DL, MVT::i32)); SDValue B = DAG.getNode( AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32), diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 713793e..d8403c2 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -215,7 +215,8 @@ public: ArrayRef<ShuffleVectorInst *> Shuffles, ArrayRef<unsigned> Indices, unsigned Factor) const override; - bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, + bool lowerInterleavedStore(Instruction *Store, Value *Mask, + ShuffleVectorInst *SVI, unsigned Factor) const override; bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask, diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index bc57537..802e4a9 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -533,8 +533,9 @@ bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB, MBP.LHS = LastInst->getOperand(0); MBP.RHS = MachineOperand::CreateImm(0); - MBP.Predicate = LastOpc == AArch64::CBNZX ? MachineBranchPredicate::PRED_NE - : MachineBranchPredicate::PRED_EQ; + MBP.Predicate = (LastOpc == AArch64::CBNZX || LastOpc == AArch64::CBNZW) + ? MachineBranchPredicate::PRED_NE + : MachineBranchPredicate::PRED_EQ; return false; } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 9f8a257..9ebdf2e 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -430,26 +430,27 @@ def UseWzrToVecMove : Predicate<"Subtarget->useWzrToVecMove()">; def SDTBinaryArithWithFlagsOut : SDTypeProfile<2, 2, [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, - SDTCisInt<0>, SDTCisVT<1, i32>]>; + SDTCisInt<0>, + SDTCisVT<1, FlagsVT>]>; // SDTBinaryArithWithFlagsIn - RES1, FLAGS = op LHS, RHS, FLAGS def SDTBinaryArithWithFlagsIn : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, - SDTCisVT<3, i32>]>; + SDTCisVT<3, FlagsVT>]>; // SDTBinaryArithWithFlagsInOut - RES1, FLAGS = op LHS, RHS, FLAGS def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3, [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisInt<0>, - SDTCisVT<1, i32>, - SDTCisVT<4, i32>]>; + SDTCisVT<1, FlagsVT>, + SDTCisVT<4, FlagsVT>]>; def SDT_AArch64Brcond : SDTypeProfile<0, 3, [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>, - SDTCisVT<2, i32>]>; + SDTCisVT<2, FlagsVT>]>; def SDT_AArch64cbz : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisVT<1, OtherVT>]>; def SDT_AArch64tbz : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisVT<2, OtherVT>]>; @@ -458,22 +459,22 @@ def SDT_AArch64CSel : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<3>, - SDTCisVT<4, i32>]>; + SDTCisVT<4, FlagsVT>]>; def SDT_AArch64CCMP : SDTypeProfile<1, 5, - [SDTCisVT<0, i32>, + [SDTCisVT<0, FlagsVT>, SDTCisInt<1>, SDTCisSameAs<1, 2>, SDTCisInt<3>, SDTCisInt<4>, SDTCisVT<5, i32>]>; def SDT_AArch64FCCMP : SDTypeProfile<1, 5, - [SDTCisVT<0, i32>, + [SDTCisVT<0, FlagsVT>, SDTCisFP<1>, SDTCisSameAs<1, 2>, SDTCisInt<3>, SDTCisInt<4>, SDTCisVT<5, i32>]>; -def SDT_AArch64FCmp : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, +def SDT_AArch64FCmp : SDTypeProfile<1, 2, [SDTCisVT<0, FlagsVT>, SDTCisFP<1>, SDTCisSameAs<2, 1>]>; def SDT_AArch64Rev : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>]>; @@ -1124,10 +1125,10 
@@ def AArch64probedalloca SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>, [SDNPHasChain, SDNPMayStore]>; -// MRS, also sets the flags via a glue. +// MRS, also sets the flags. def AArch64mrs : SDNode<"AArch64ISD::MRS", SDTypeProfile<2, 1, [SDTCisVT<0, i64>, - SDTCisVT<1, i32>, + SDTCisVT<1, FlagsVT>, SDTCisVT<2, i32>]>, [SDNPHasChain]>; @@ -3934,6 +3935,26 @@ defm LDRSW : LoadUI<0b10, 0, 0b10, GPR64, uimm12s4, "ldrsw", def : Pat<(i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))), (SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>; +// load zero-extended i32, bitcast to f64 +def : Pat <(f64 (bitconvert (i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))), + (SUBREG_TO_REG (i64 0), (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>; + +// load zero-extended i16, bitcast to f64 +def : Pat <(f64 (bitconvert (i64 (zextloadi16 (am_indexed32 GPR64sp:$Rn, uimm12s2:$offset))))), + (SUBREG_TO_REG (i64 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>; + +// load zero-extended i8, bitcast to f64 +def : Pat <(f64 (bitconvert (i64 (zextloadi8 (am_indexed32 GPR64sp:$Rn, uimm12s1:$offset))))), + (SUBREG_TO_REG (i64 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>; + +// load zero-extended i16, bitcast to f32 +def : Pat <(f32 (bitconvert (i32 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), + (SUBREG_TO_REG (i32 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>; + +// load zero-extended i8, bitcast to f32 +def : Pat <(f32 (bitconvert (i32 (zextloadi8 (am_indexed16 GPR64sp:$Rn, uimm12s1:$offset))))), + (SUBREG_TO_REG (i32 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>; + // Pre-fetch. def PRFMui : PrefetchUI<0b11, 0, 0b10, "prfm", [(AArch64Prefetch timm:$Rt, diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp index 0ddd17c..abcd550 100644 --- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp +++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp @@ -8,8 +8,8 @@ // // This pass performs below peephole optimizations on MIR level. // -// 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri -// MOVi64imm + ANDXrr ==> ANDXri + ANDXri +// 1. MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri +// MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri // // 2. MOVi32imm + ADDWrr ==> ADDWRi + ADDWRi // MOVi64imm + ADDXrr ==> ANDXri + ANDXri @@ -126,7 +126,7 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass { bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI); template <typename T> - bool visitAND(unsigned Opc, MachineInstr &MI); + bool visitAND(unsigned Opc, MachineInstr &MI, unsigned OtherOpc = 0); bool visitORR(MachineInstr &MI); bool visitCSEL(MachineInstr &MI); bool visitINSERT(MachineInstr &MI); @@ -194,12 +194,12 @@ static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) { } template <typename T> -bool AArch64MIPeepholeOpt::visitAND( - unsigned Opc, MachineInstr &MI) { +bool AArch64MIPeepholeOpt::visitAND(unsigned Opc, MachineInstr &MI, + unsigned OtherOpc) { // Try below transformation. // - // MOVi32imm + ANDWrr ==> ANDWri + ANDWri - // MOVi64imm + ANDXrr ==> ANDXri + ANDXri + // MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri + // MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri // // The mov pseudo instruction could be expanded to multiple mov instructions // later. 
Let's try to split the constant operand of mov instruction into two @@ -208,10 +208,10 @@ bool AArch64MIPeepholeOpt::visitAND( return splitTwoPartImm<T>( MI, - [Opc](T Imm, unsigned RegSize, T &Imm0, - T &Imm1) -> std::optional<OpcodePair> { + [Opc, OtherOpc](T Imm, unsigned RegSize, T &Imm0, + T &Imm1) -> std::optional<OpcodePair> { if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1)) - return std::make_pair(Opc, Opc); + return std::make_pair(Opc, !OtherOpc ? Opc : OtherOpc); return std::nullopt; }, [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0, @@ -864,6 +864,12 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) { case AArch64::ANDXrr: Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI); break; + case AArch64::ANDSWrr: + Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI, AArch64::ANDSWri); + break; + case AArch64::ANDSXrr: + Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI, AArch64::ANDSXri); + break; case AArch64::ORRWrs: Changed |= visitORR(MI); break; diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td index 61bf87f..1a7609b 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td @@ -305,7 +305,8 @@ def GPR64pi48 : RegisterOperand<GPR64, "printPostIncOperand<48>">; def GPR64pi64 : RegisterOperand<GPR64, "printPostIncOperand<64>">; // Condition code regclass. -def CCR : RegisterClass<"AArch64", [i32], 32, (add NZCV)> { +defvar FlagsVT = i32; +def CCR : RegisterClass<"AArch64", [FlagsVT], 32, (add NZCV)> { let CopyCost = -1; // Don't allow copying of status registers. // CCR is not allocatable. diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp index bafb8d0..8a5b5ba 100644 --- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -32,10 +32,29 @@ AArch64SelectionDAGInfo::AArch64SelectionDAGInfo() void AArch64SelectionDAGInfo::verifyTargetNode(const SelectionDAG &DAG, const SDNode *N) const { + SelectionDAGGenTargetInfo::verifyTargetNode(DAG, N); + #ifndef NDEBUG + // Some additional checks not yet implemented by verifyTargetNode. + constexpr MVT FlagsVT = MVT::i32; switch (N->getOpcode()) { - default: - return SelectionDAGGenTargetInfo::verifyTargetNode(DAG, N); + case AArch64ISD::SUBS: + assert(N->getValueType(1) == FlagsVT); + break; + case AArch64ISD::ADC: + case AArch64ISD::SBC: + assert(N->getOperand(2).getValueType() == FlagsVT); + break; + case AArch64ISD::ADCS: + case AArch64ISD::SBCS: + assert(N->getValueType(1) == FlagsVT); + assert(N->getOperand(2).getValueType() == FlagsVT); + break; + case AArch64ISD::CSEL: + case AArch64ISD::CSINC: + case AArch64ISD::BRCOND: + assert(N->getOperand(3).getValueType() == FlagsVT); + break; case AArch64ISD::SADDWT: case AArch64ISD::SADDWB: case AArch64ISD::UADDWT: diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp index 75c7dd9..f136a184 100644 --- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp +++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp @@ -581,7 +581,6 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { // statement if return_twice functions are called. 
bool StandardLifetime = !SInfo.CallsReturnTwice && - SInfo.UnrecognizedLifetimes.empty() && memtag::isStandardLifetime(Info.LifetimeStart, Info.LifetimeEnd, DT, LI, ClMaxLifetimes); if (StandardLifetime) { @@ -616,10 +615,5 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { memtag::annotateDebugRecords(Info, Tag); } - // If we have instrumented at least one alloca, all unrecognized lifetime - // intrinsics have to go. - for (auto *I : SInfo.UnrecognizedLifetimes) - I->eraseFromParent(); - return true; } diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index 2409cc8..0f4f012 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -534,7 +534,7 @@ unsigned AArch64Subtarget::classifyGlobalFunctionReference( } void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, - unsigned NumRegionInstrs) const { + const SchedRegion &Region) const { // LNT run (at least on Cyclone) showed reasonably significant gains for // bi-directional scheduling. 253.perlbmk. Policy.OnlyTopDown = false; diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index 154db3c..061ed61 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -343,7 +343,8 @@ public: } void overrideSchedPolicy(MachineSchedPolicy &Policy, - unsigned NumRegionInstrs) const override; + const SchedRegion &Region) const override; + void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep, const TargetSchedModel *SchedModel) const override; diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 90d3d92..40f49da 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -249,7 +249,7 @@ static bool hasPossibleIncompatibleOps(const Function *F) { return false; } -uint64_t AArch64TTIImpl::getFeatureMask(const Function &F) const { +APInt AArch64TTIImpl::getFeatureMask(const Function &F) const { StringRef AttributeStr = isMultiversionedFunction(F) ? 
"fmv-features" : "target-features"; StringRef FeatureStr = F.getFnAttribute(AttributeStr).getValueAsString(); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index b27eb2e..7f45177 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -89,7 +89,7 @@ public: unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const override; - uint64_t getFeatureMask(const Function &F) const override; + APInt getFeatureMask(const Function &F) const override; bool isMultiversionedFunction(const Function &F) const override; diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp index 3d4a14b..1a9bce5 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp @@ -9,8 +9,6 @@ #include "AArch64MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCStreamer.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/ErrorHandling.h" using namespace llvm; diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 6076ac4..8b8fc8b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -149,6 +149,12 @@ def FeatureFmaMixInsts : SubtargetFeature<"fma-mix-insts", "Has v_fma_mix_f32, v_fma_mixlo_f16, v_fma_mixhi_f16 instructions" >; +def FeatureFmaMixBF16Insts : SubtargetFeature<"fma-mix-bf16-insts", + "HasFmaMixBF16Insts", + "true", + "Has v_fma_mix_f32_bf16, v_fma_mixlo_bf16, v_fma_mixhi_bf16 instructions" +>; + def FeatureIEEEMinimumMaximumInsts : SubtargetFeature<"ieee-minimum-maximum-insts", "HasIEEEMinimumMaximumInsts", "true", @@ -167,6 +173,12 @@ def FeatureMinimum3Maximum3F16 : SubtargetFeature<"minimum3-maximum3-f16", "Has v_minimum3_f16 and v_maximum3_f16 instructions" >; +def FeatureMin3Max3PKF16 : SubtargetFeature<"min3-max3-pkf16", + "HasMin3Max3PKF16", + "true", + "Has v_pk_min3_num_f16 and v_pk_max3_num_f16 instructions" +>; + def FeatureMinimum3Maximum3PKF16 : SubtargetFeature<"minimum3-maximum3-pkf16", "HasMinimum3Maximum3PKF16", "true", @@ -256,12 +268,24 @@ def FeatureInstFwdPrefetchBug : SubtargetFeature<"inst-fwd-prefetch-bug", "S_INST_PREFETCH instruction causes shader to hang" >; +def FeatureVmemPrefInsts : SubtargetFeature<"vmem-pref-insts", + "HasVmemPrefInsts", + "true", + "Has flat_prefect_b8 and global_prefetch_b8 instructions" +>; + def FeatureSafeSmemPrefetch : SubtargetFeature<"safe-smem-prefetch", "HasSafeSmemPrefetch", "true", "SMEM prefetches do not fail on illegal address" >; +def FeatureSafeCUPrefetch : SubtargetFeature<"safe-cu-prefetch", + "HasSafeCUPrefetch", + "true", + "VMEM CU scope prefetches do not fail on illegal address" +>; + def FeatureVcmpxExecWARHazard : SubtargetFeature<"vcmpx-exec-war-hazard", "HasVcmpxExecWARHazard", "true", @@ -559,6 +583,12 @@ def FeatureBF16ConversionInsts : SubtargetFeature<"bf16-cvt-insts", "Has bf16 conversion instructions" >; +def FeatureBF16PackedInsts : SubtargetFeature<"bf16-pk-insts", + "HasBF16PackedInsts", + "true", + "Has bf16 packed instructions (fma, add, mul, max, min)" +>; + def FeatureVOP3P : SubtargetFeature<"vop3p", "HasVOP3PInsts", "true", @@ -1349,6 +1379,10 @@ def FeatureLshlAddU64Inst : SubtargetFeature<"lshl-add-u64-inst", "HasLshlAddU64Inst", "true", "Has v_lshl_add_u64 instruction">; +def FeatureAddSubU64Insts + : 
SubtargetFeature<"add-sub-u64-insts", "HasAddSubU64Insts", "true", + "Has v_add_u64 and v_sub_u64 instructions">; + def FeatureMemToLDSLoad : SubtargetFeature<"vmem-to-lds-load-insts", "HasVMemToLDSLoad", "true", @@ -1989,7 +2023,10 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureTransposeLoadF4F6Insts, FeatureBF16TransInsts, FeatureBF16ConversionInsts, + FeatureBF16PackedInsts, FeatureCvtPkF16F32Inst, + FeatureFmaMixBF16Insts, + FeatureMin3Max3PKF16, FeatureMinimum3Maximum3PKF16, FeaturePrngInst, FeaturePermlane16Swap, @@ -2002,7 +2039,9 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureFlatBufferGlobalAtomicFaddF64Inst, FeatureMemoryAtomicFAddF32DenormalSupport, FeatureKernargPreload, + FeatureVmemPrefInsts, FeatureLshlAddU64Inst, + FeatureAddSubU64Insts, FeatureLdsBarrierArriveAtomic, FeatureSetPrioIncWgInst, ]>; @@ -2349,6 +2388,10 @@ def HasMinimum3Maximum3F16 : Predicate<"Subtarget->hasMinimum3Maximum3F16()">, AssemblerPredicate<(all_of FeatureMinimum3Maximum3F16)>; +def HasMin3Max3PKF16 : + Predicate<"Subtarget->hasMin3Max3PKF16()">, + AssemblerPredicate<(all_of FeatureMin3Max3PKF16)>; + def HasMinimum3Maximum3PKF16 : Predicate<"Subtarget->hasMinimum3Maximum3PKF16()">, AssemblerPredicate<(all_of FeatureMinimum3Maximum3PKF16)>; @@ -2472,6 +2515,9 @@ def HasBF16TransInsts : Predicate<"Subtarget->hasBF16TransInsts()">, def HasBF16ConversionInsts : Predicate<"Subtarget->hasBF16ConversionInsts()">, AssemblerPredicate<(all_of FeatureBF16ConversionInsts)>; +def HasBF16PackedInsts : Predicate<"Subtarget->hasBF16PackedInsts()">, + AssemblerPredicate<(all_of FeatureBF16PackedInsts)>; + def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">, AssemblerPredicate<(all_of FeatureVOP3P)>; @@ -2519,6 +2565,14 @@ def HasFmaakFmamkF64Insts : Predicate<"Subtarget->hasFmaakFmamkF64Insts()">, AssemblerPredicate<(any_of FeatureGFX1250Insts)>; +def HasPkAddMinMaxInsts : + Predicate<"Subtarget->hasPkAddMinMaxInsts()">, + AssemblerPredicate<(any_of FeatureGFX1250Insts)>; + +def HasPkMinMax3Insts : + Predicate<"Subtarget->hasPkMinMax3Insts()">, + AssemblerPredicate<(any_of FeatureGFX1250Insts)>; + def HasImageInsts : Predicate<"Subtarget->hasImageInsts()">, AssemblerPredicate<(all_of FeatureImageInsts)>; @@ -2565,6 +2619,9 @@ def HasMovrel : Predicate<"Subtarget->hasMovrel()">, def HasFmaMixInsts : Predicate<"Subtarget->hasFmaMixInsts()">, AssemblerPredicate<(all_of FeatureFmaMixInsts)>; +def HasFmaMixBF16Insts : Predicate<"Subtarget->hasFmaMixBF16Insts()">, + AssemblerPredicate<(all_of FeatureFmaMixBF16Insts)>; + def HasDLInsts : Predicate<"Subtarget->hasDLInsts()">, AssemblerPredicate<(all_of FeatureDLInsts)>; @@ -2763,12 +2820,18 @@ def HasScalarDwordx3Loads : Predicate<"Subtarget->hasScalarDwordx3Loads()">; def HasXF32Insts : Predicate<"Subtarget->hasXF32Insts()">, AssemblerPredicate<(all_of FeatureXF32Insts)>; +def HasVmemPrefInsts : Predicate<"Subtarget->hasVmemPrefInsts()">, + AssemblerPredicate<(all_of FeatureVmemPrefInsts)>; + def HasAshrPkInsts : Predicate<"Subtarget->hasAshrPkInsts()">, AssemblerPredicate<(all_of FeatureAshrPkInsts)>; def HasLshlAddU64Inst : Predicate<"Subtarget->hasLshlAddU64Inst()">, AssemblerPredicate<(all_of FeatureLshlAddU64Inst)>; +def HasAddSubU64Insts : Predicate<"Subtarget->hasAddSubU64Insts()">, + AssemblerPredicate<(all_of FeatureAddSubU64Insts)>; + def HasLdsBarrierArriveAtomic : Predicate<"Subtarget->hasLdsBarrierArriveAtomic()">, AssemblerPredicate<(all_of FeatureLdsBarrierArriveAtomic)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index dedee46..49d8b44 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -1383,7 +1383,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM, &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID, &AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID, &AANoAliasAddrSpace::ID, &AAAddressSpace::ID, - &AAIndirectCallInfo::ID, &AAInstanceInfo::ID}); + &AAIndirectCallInfo::ID}); AttributorConfig AC(CGUpdater); AC.IsClosedWorldModule = Options.IsClosedWorld; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 891d362..108842f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -446,5 +446,8 @@ def gi_fp_pow2_to_exponent : GICustomOperandRenderer<"renderFPPow2ToExponent">, def gi_as_hw_round_mode : GICustomOperandRenderer<"renderRoundMode">, GISDNodeXFormEquiv<as_hw_round_mode>; +def gi_prefetch_loc : GICustomOperandRenderer<"renderPrefetchLoc">, + GISDNodeXFormEquiv<PrefetchLoc>; + def gi_MFMALdScaleModifierOp : GICustomOperandRenderer<"renderScaledMAIIntrinsicOperand">, GISDNodeXFormEquiv<MFMALdScaleXForm>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp index 00979f4..f36935d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp @@ -117,45 +117,72 @@ static LLT getReadAnyLaneSplitTy(LLT Ty) { return LLT::scalar(32); } -static Register buildReadAnyLane(MachineIRBuilder &B, Register VgprSrc, - const RegisterBankInfo &RBI); - -static void unmergeReadAnyLane(MachineIRBuilder &B, - SmallVectorImpl<Register> &SgprDstParts, - LLT UnmergeTy, Register VgprSrc, - const RegisterBankInfo &RBI) { +template <typename ReadLaneFnTy> +static Register buildReadLane(MachineIRBuilder &, Register, + const RegisterBankInfo &, ReadLaneFnTy); + +template <typename ReadLaneFnTy> +static void +unmergeReadAnyLane(MachineIRBuilder &B, SmallVectorImpl<Register> &SgprDstParts, + LLT UnmergeTy, Register VgprSrc, const RegisterBankInfo &RBI, + ReadLaneFnTy BuildRL) { const RegisterBank *VgprRB = &RBI.getRegBank(AMDGPU::VGPRRegBankID); auto Unmerge = B.buildUnmerge({VgprRB, UnmergeTy}, VgprSrc); for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) { - SgprDstParts.push_back(buildReadAnyLane(B, Unmerge.getReg(i), RBI)); + SgprDstParts.push_back(buildReadLane(B, Unmerge.getReg(i), RBI, BuildRL)); } } -static Register buildReadAnyLane(MachineIRBuilder &B, Register VgprSrc, - const RegisterBankInfo &RBI) { +template <typename ReadLaneFnTy> +static Register buildReadLane(MachineIRBuilder &B, Register VgprSrc, + const RegisterBankInfo &RBI, + ReadLaneFnTy BuildRL) { LLT Ty = B.getMRI()->getType(VgprSrc); const RegisterBank *SgprRB = &RBI.getRegBank(AMDGPU::SGPRRegBankID); if (Ty.getSizeInBits() == 32) { - return B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {{SgprRB, Ty}}, {VgprSrc}) - .getReg(0); + Register SgprDst = B.getMRI()->createVirtualRegister({SgprRB, Ty}); + return BuildRL(B, SgprDst, VgprSrc).getReg(0); } SmallVector<Register, 8> SgprDstParts; - unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI); + unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI, + BuildRL); return B.buildMergeLikeInstr({SgprRB, Ty}, SgprDstParts).getReg(0); } -void 
AMDGPU::buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, - Register VgprSrc, const RegisterBankInfo &RBI) { +template <typename ReadLaneFnTy> +static void buildReadLane(MachineIRBuilder &B, Register SgprDst, + Register VgprSrc, const RegisterBankInfo &RBI, + ReadLaneFnTy BuildReadLane) { LLT Ty = B.getMRI()->getType(VgprSrc); if (Ty.getSizeInBits() == 32) { - B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {SgprDst}, {VgprSrc}); + BuildReadLane(B, SgprDst, VgprSrc); return; } SmallVector<Register, 8> SgprDstParts; - unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI); + unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI, + BuildReadLane); B.buildMergeLikeInstr(SgprDst, SgprDstParts).getReg(0); } + +void AMDGPU::buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, + Register VgprSrc, const RegisterBankInfo &RBI) { + return buildReadLane( + B, SgprDst, VgprSrc, RBI, + [](MachineIRBuilder &B, Register SgprDst, Register VgprSrc) { + return B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {SgprDst}, {VgprSrc}); + }); +} + +void AMDGPU::buildReadFirstLane(MachineIRBuilder &B, Register SgprDst, + Register VgprSrc, const RegisterBankInfo &RBI) { + return buildReadLane( + B, SgprDst, VgprSrc, RBI, + [](MachineIRBuilder &B, Register SgprDst, Register VgprSrc) { + return B.buildIntrinsic(Intrinsic::amdgcn_readfirstlane, SgprDst) + .addReg(VgprSrc); + }); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h index 0c89bb5..5e1000e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h @@ -51,6 +51,8 @@ private: void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI); +void buildReadFirstLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, + const RegisterBankInfo &RBI); } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 00c7f0e..0ca2286 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1863,9 +1863,17 @@ bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr, SIInstrFlags::FlatScratch); } -// If this matches zero_extend i32:x, return x -static SDValue matchZExtFromI32(SDValue Op) { - if (Op.getOpcode() != ISD::ZERO_EXTEND) +// If this matches *_extend i32:x, return x +// Otherwise if the value is I32 returns x. +static SDValue matchExtFromI32orI32(SDValue Op, bool IsSigned, + const SelectionDAG *DAG) { + if (Op.getValueType() == MVT::i32) + return Op; + + if (Op.getOpcode() != (IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND) && + Op.getOpcode() != ISD::ANY_EXTEND && + !(DAG->SignBitIsZero(Op) && + Op.getOpcode() == (IsSigned ? 
ISD::ZERO_EXTEND : ISD::SIGN_EXTEND))) return SDValue(); SDValue ExtSrc = Op.getOperand(0); @@ -1873,12 +1881,13 @@ static SDValue matchZExtFromI32(SDValue Op) { } // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset) -bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, - SDValue Addr, - SDValue &SAddr, - SDValue &VOffset, - SDValue &Offset) const { +// or (64-bit SGPR base) + (sext vgpr offset) + sext(imm offset) +bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr, + SDValue &SAddr, SDValue &VOffset, + SDValue &Offset, bool &ScaleOffset, + bool NeedIOffset) const { int64_t ImmOffset = 0; + ScaleOffset = false; // Match the immediate offset first, which canonically is moved as low as // possible. @@ -1888,7 +1897,8 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue(); const SIInstrInfo *TII = Subtarget->getInstrInfo(); - if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, + if (NeedIOffset && + TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal)) { Addr = LHS; ImmOffset = COffsetVal; @@ -1898,11 +1908,14 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, // saddr + large_offset -> saddr + // (voffset = large_offset & ~MaxOffset) + // (large_offset & MaxOffset); - int64_t SplitImmOffset, RemainderOffset; - std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset( - COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal); + int64_t SplitImmOffset = 0, RemainderOffset = COffsetVal; + if (NeedIOffset) { + std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset( + COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal); + } - if (isUInt<32>(RemainderOffset)) { + if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset) + : isUInt<32>(RemainderOffset)) { SDNode *VMov = CurDAG->getMachineNode( AMDGPU::V_MOV_B32_e32, SL, MVT::i32, CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32)); @@ -1929,21 +1942,26 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, // Match the variable offset. if (Addr.getOpcode() == ISD::ADD) { LHS = Addr.getOperand(0); - RHS = Addr.getOperand(1); if (!LHS->isDivergent()) { - // add (i64 sgpr), (zero_extend (i32 vgpr)) - if (SDValue ZextRHS = matchZExtFromI32(RHS)) { + // add (i64 sgpr), (*_extend (i32 vgpr)) + RHS = Addr.getOperand(1); + ScaleOffset = SelectScaleOffset(N, RHS, Subtarget->hasSignedGVSOffset()); + if (SDValue ExtRHS = matchExtFromI32orI32( + RHS, Subtarget->hasSignedGVSOffset(), CurDAG)) { SAddr = LHS; - VOffset = ZextRHS; + VOffset = ExtRHS; } } + RHS = Addr.getOperand(1); if (!SAddr && !RHS->isDivergent()) { - // add (zero_extend (i32 vgpr)), (i64 sgpr) - if (SDValue ZextLHS = matchZExtFromI32(LHS)) { + // add (*_extend (i32 vgpr)), (i64 sgpr) + ScaleOffset = SelectScaleOffset(N, LHS, Subtarget->hasSignedGVSOffset()); + if (SDValue ExtLHS = matchExtFromI32orI32( + LHS, Subtarget->hasSignedGVSOffset(), CurDAG)) { SAddr = RHS; - VOffset = ZextLHS; + VOffset = ExtLHS; } } @@ -1953,6 +1971,27 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, } } + if (Subtarget->hasScaleOffset() && + (Addr.getOpcode() == (Subtarget->hasSignedGVSOffset() + ? 
AMDGPUISD::MAD_I64_I32 + : AMDGPUISD::MAD_U64_U32) || + (Addr.getOpcode() == AMDGPUISD::MAD_U64_U32 && + CurDAG->SignBitIsZero(Addr.getOperand(0)))) && + Addr.getOperand(0)->isDivergent() && + isa<ConstantSDNode>(Addr.getOperand(1)) && + !Addr.getOperand(2)->isDivergent()) { + // mad_u64_u32 (i32 vgpr), (i32 c), (i64 sgpr) + unsigned Size = + (unsigned)cast<MemSDNode>(N)->getMemoryVT().getFixedSizeInBits() / 8; + ScaleOffset = Addr.getConstantOperandVal(1) == Size; + if (ScaleOffset) { + SAddr = Addr.getOperand(2); + VOffset = Addr.getOperand(0); + Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32); + return true; + } + } + if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF || isa<ConstantSDNode>(Addr)) return false; @@ -1972,10 +2011,12 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset, SDValue &Offset, SDValue &CPol) const { - if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset)) + bool ScaleOffset; + if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset)) return false; - CPol = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32); + CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0, + SDLoc(), MVT::i32); return true; } @@ -1983,10 +2024,11 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset, SDValue &Offset, SDValue &CPol) const { - if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset)) + bool ScaleOffset; + if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset)) return false; - unsigned CPolVal = AMDGPU::CPol::GLC; + unsigned CPolVal = (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | AMDGPU::CPol::GLC; CPol = CurDAG->getTargetConstant(CPolVal, SDLoc(), MVT::i32); return true; } @@ -2074,7 +2116,8 @@ bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug( bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, SDValue &VAddr, SDValue &SAddr, - SDValue &Offset) const { + SDValue &Offset, + SDValue &CPol) const { int64_t ImmOffset = 0; SDValue LHS, RHS; @@ -2106,6 +2149,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset)) return false; Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32); + CPol = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32); return true; } } @@ -2139,6 +2183,10 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, return false; SAddr = SelectSAddrFI(CurDAG, SAddr); Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32); + + bool ScaleOffset = SelectScaleOffset(N, VAddr, true /* IsSigned */); + CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0, + SDLoc(), MVT::i32); return true; } @@ -2159,17 +2207,59 @@ bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset, return true; } +// Given \p Offset and load node \p N check if an \p Offset is a multiple of +// the load byte size. If it is update \p Offset to a pre-scaled value and +// return true. 
+bool AMDGPUDAGToDAGISel::SelectScaleOffset(SDNode *N, SDValue &Offset, + bool IsSigned) const { + bool ScaleOffset = false; + if (!Subtarget->hasScaleOffset() || !Offset) + return false; + + unsigned Size = + (unsigned)cast<MemSDNode>(N)->getMemoryVT().getFixedSizeInBits() / 8; + + SDValue Off = Offset; + if (SDValue Ext = matchExtFromI32orI32(Offset, IsSigned, CurDAG)) + Off = Ext; + + if (isPowerOf2_32(Size) && Off.getOpcode() == ISD::SHL) { + if (auto *C = dyn_cast<ConstantSDNode>(Off.getOperand(1))) + ScaleOffset = C->getZExtValue() == Log2_32(Size); + } else if (Offset.getOpcode() == ISD::MUL || + (IsSigned && Offset.getOpcode() == AMDGPUISD::MUL_I24) || + Offset.getOpcode() == AMDGPUISD::MUL_U24 || + (Offset.isMachineOpcode() && + Offset.getMachineOpcode() == + (IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO + : AMDGPU::S_MUL_U64_U32_PSEUDO))) { + if (auto *C = dyn_cast<ConstantSDNode>(Offset.getOperand(1))) + ScaleOffset = C->getZExtValue() == Size; + } + + if (ScaleOffset) + Offset = Off.getOperand(0); + + return ScaleOffset; +} + // Match an immediate (if Offset is not null) or an SGPR (if SOffset is // not null) offset. If Imm32Only is true, match only 32-bit immediate // offsets available on CI. -bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, +bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDNode *N, SDValue ByteOffsetNode, SDValue *SOffset, SDValue *Offset, bool Imm32Only, bool IsBuffer, - bool HasSOffset, - int64_t ImmOffset) const { + bool HasSOffset, int64_t ImmOffset, + bool *ScaleOffset) const { assert((!SOffset || !Offset) && "Cannot match both soffset and offset at the same time!"); + if (ScaleOffset) { + assert(N && SOffset); + + *ScaleOffset = SelectScaleOffset(N, ByteOffsetNode, false /* IsSigned */); + } + ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode); if (!C) { if (!SOffset) @@ -2254,24 +2344,25 @@ SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const { // Match a base and an immediate (if Offset is not null) or an SGPR (if // SOffset is not null) or an immediate+SGPR offset. If Imm32Only is // true, match only 32-bit immediate offsets available on CI. 
-bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase, - SDValue *SOffset, SDValue *Offset, - bool Imm32Only, bool IsBuffer, - bool HasSOffset, - int64_t ImmOffset) const { +bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDNode *N, SDValue Addr, + SDValue &SBase, SDValue *SOffset, + SDValue *Offset, bool Imm32Only, + bool IsBuffer, bool HasSOffset, + int64_t ImmOffset, + bool *ScaleOffset) const { if (SOffset && Offset) { assert(!Imm32Only && !IsBuffer); SDValue B; - if (!SelectSMRDBaseOffset(Addr, B, nullptr, Offset, false, false, true)) + if (!SelectSMRDBaseOffset(N, Addr, B, nullptr, Offset, false, false, true)) return false; int64_t ImmOff = 0; if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset)) ImmOff = C->getSExtValue(); - return SelectSMRDBaseOffset(B, SBase, SOffset, nullptr, false, false, true, - ImmOff); + return SelectSMRDBaseOffset(N, B, SBase, SOffset, nullptr, false, false, + true, ImmOff, ScaleOffset); } // A 32-bit (address + offset) should not cause unsigned 32-bit integer @@ -2291,23 +2382,25 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase, if (!N0 || !N1) return false; - if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset, - ImmOffset)) { + if (SelectSMRDOffset(N, N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset, + ImmOffset, ScaleOffset)) { SBase = N0; return true; } - if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset, - ImmOffset)) { + if (SelectSMRDOffset(N, N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset, + ImmOffset, ScaleOffset)) { SBase = N1; return true; } return false; } -bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase, +bool AMDGPUDAGToDAGISel::SelectSMRD(SDNode *N, SDValue Addr, SDValue &SBase, SDValue *SOffset, SDValue *Offset, - bool Imm32Only) const { - if (SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only)) { + bool Imm32Only, bool *ScaleOffset) const { + if (SelectSMRDBaseOffset(N, Addr, SBase, SOffset, Offset, Imm32Only, + /* IsBuffer */ false, /* HasSOffset */ false, + /* ImmOffset */ 0, ScaleOffset)) { SBase = Expand32BitAddress(SBase); return true; } @@ -2323,36 +2416,51 @@ bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase, bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const { - return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset); + return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr, + &Offset); } bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const { assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS); - return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset, - /* Imm32Only */ true); + return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr, + &Offset, /* Imm32Only */ true); } -bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase, - SDValue &SOffset) const { - return SelectSMRD(Addr, SBase, &SOffset, /* Offset */ nullptr); +bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDNode *N, SDValue Addr, SDValue &SBase, + SDValue &SOffset, SDValue &CPol) const { + bool ScaleOffset; + if (!SelectSMRD(N, Addr, SBase, &SOffset, /* Offset */ nullptr, + /* Imm32Only */ false, &ScaleOffset)) + return false; + + CPol = CurDAG->getTargetConstant(ScaleOffset ? 
AMDGPU::CPol::SCAL : 0, + SDLoc(N), MVT::i32); + return true; } -bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDValue Addr, SDValue &SBase, - SDValue &SOffset, - SDValue &Offset) const { - return SelectSMRD(Addr, SBase, &SOffset, &Offset); +bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDNode *N, SDValue Addr, + SDValue &SBase, SDValue &SOffset, + SDValue &Offset, + SDValue &CPol) const { + bool ScaleOffset; + if (!SelectSMRD(N, Addr, SBase, &SOffset, &Offset, false, &ScaleOffset)) + return false; + + CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0, + SDLoc(N), MVT::i32); + return true; } bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const { - return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset, + return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset, /* Imm32Only */ false, /* IsBuffer */ true); } bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N, SDValue &Offset) const { assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS); - return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset, + return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset, /* Imm32Only */ true, /* IsBuffer */ true); } @@ -2361,9 +2469,9 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset, // Match the (soffset + offset) pair as a 32-bit register base and // an immediate offset. return N.getValueType() == MVT::i32 && - SelectSMRDBaseOffset(N, /* SBase */ SOffset, /* SOffset*/ nullptr, - &Offset, /* Imm32Only */ false, - /* IsBuffer */ true); + SelectSMRDBaseOffset(/* N */ nullptr, N, /* SBase */ SOffset, + /* SOffset*/ nullptr, &Offset, + /* Imm32Only */ false, /* IsBuffer */ true); } bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index, @@ -3753,58 +3861,114 @@ bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src, return SelectVOP3Mods(In, Src, SrcMods); } +// Match lowered fpext from bf16 to f32. This is a bit operation extending +// a 16-bit value with 16-bit of zeroes at LSB: +// +// 1. (f32 (bitcast (build_vector (i16 0), (i16 (bitcast bf16:val))))) +// 2. (f32 (bitcast (and i32:val, 0xffff0000))) -> IsExtractHigh = true +// 3. (f32 (bitcast (shl i32:va, 16) -> IsExtractHigh = false +static SDValue matchBF16FPExtendLike(SDValue Op, bool &IsExtractHigh) { + if (Op.getValueType() != MVT::f32 || Op.getOpcode() != ISD::BITCAST) + return SDValue(); + Op = Op.getOperand(0); + + IsExtractHigh = false; + if (Op.getValueType() == MVT::v2i16 && Op.getOpcode() == ISD::BUILD_VECTOR) { + auto Low16 = dyn_cast<ConstantSDNode>(Op.getOperand(0)); + if (!Low16 || !Low16->isZero()) + return SDValue(); + Op = stripBitcast(Op.getOperand(1)); + if (Op.getValueType() != MVT::bf16) + return SDValue(); + return Op; + } + + if (Op.getValueType() != MVT::i32) + return SDValue(); + + if (Op.getOpcode() == ISD::AND) { + if (auto Mask = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { + if (Mask->getZExtValue() == 0xffff0000) { + IsExtractHigh = true; + return Op.getOperand(0); + } + } + return SDValue(); + } + + if (Op.getOpcode() == ISD::SHL) { + if (auto Amt = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { + if (Amt->getZExtValue() == 16) + return Op.getOperand(0); + } + } + + return SDValue(); +} + // The return value is not whether the match is possible (which it always is), // but whether or not it a conversion is really used. 
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, - unsigned &Mods) const { + unsigned &Mods, + MVT VT) const { Mods = 0; SelectVOP3ModsImpl(In, Src, Mods); + bool IsExtractHigh = false; if (Src.getOpcode() == ISD::FP_EXTEND) { Src = Src.getOperand(0); - assert(Src.getValueType() == MVT::f16); - Src = stripBitcast(Src); + } else if (VT == MVT::bf16) { + SDValue B16 = matchBF16FPExtendLike(Src, IsExtractHigh); + if (!B16) + return false; + Src = B16; + } else + return false; - // Be careful about folding modifiers if we already have an abs. fneg is - // applied last, so we don't want to apply an earlier fneg. - if ((Mods & SISrcMods::ABS) == 0) { - unsigned ModsTmp; - SelectVOP3ModsImpl(Src, Src, ModsTmp); + if (Src.getValueType() != VT && + (VT != MVT::bf16 || Src.getValueType() != MVT::i32)) + return false; - if ((ModsTmp & SISrcMods::NEG) != 0) - Mods ^= SISrcMods::NEG; + Src = stripBitcast(Src); - if ((ModsTmp & SISrcMods::ABS) != 0) - Mods |= SISrcMods::ABS; - } + // Be careful about folding modifiers if we already have an abs. fneg is + // applied last, so we don't want to apply an earlier fneg. + if ((Mods & SISrcMods::ABS) == 0) { + unsigned ModsTmp; + SelectVOP3ModsImpl(Src, Src, ModsTmp); - // op_sel/op_sel_hi decide the source type and source. - // If the source's op_sel_hi is set, it indicates to do a conversion from fp16. - // If the sources's op_sel is set, it picks the high half of the source - // register. + if ((ModsTmp & SISrcMods::NEG) != 0) + Mods ^= SISrcMods::NEG; - Mods |= SISrcMods::OP_SEL_1; - if (isExtractHiElt(Src, Src)) { - Mods |= SISrcMods::OP_SEL_0; + if ((ModsTmp & SISrcMods::ABS) != 0) + Mods |= SISrcMods::ABS; + } - // TODO: Should we try to look for neg/abs here? - } + // op_sel/op_sel_hi decide the source type and source. + // If the source's op_sel_hi is set, it indicates to do a conversion from + // fp16. If the sources's op_sel is set, it picks the high half of the source + // register. - // Prevent unnecessary subreg COPY to VGPR_16 - if (Src.getOpcode() == ISD::TRUNCATE && - Src.getOperand(0).getValueType() == MVT::i32) { - Src = Src.getOperand(0); - } - return true; + Mods |= SISrcMods::OP_SEL_1; + if (IsExtractHigh || + (Src.getValueSizeInBits() == 16 && isExtractHiElt(Src, Src))) { + Mods |= SISrcMods::OP_SEL_0; + + // TODO: Should we try to look for neg/abs here? 
} - return false; + // Prevent unnecessary subreg COPY to VGPR_16 + if (Src.getOpcode() == ISD::TRUNCATE && + Src.getOperand(0).getValueType() == MVT::i32) { + Src = Src.getOperand(0); + } + return true; } bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src, SDValue &SrcMods) const { unsigned Mods = 0; - if (!SelectVOP3PMadMixModsImpl(In, Src, Mods)) + if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16)) return false; SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); return true; @@ -3813,7 +3977,24 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src, bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const { unsigned Mods = 0; - SelectVOP3PMadMixModsImpl(In, Src, Mods); + SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16); + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16ModsExt(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + unsigned Mods = 0; + if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16)) + return false; + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + unsigned Mods = 0; + SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16); SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index acbab3d..a6ce745 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -19,6 +19,7 @@ #include "SIModeRegisterDefaults.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/Support/AMDGPUAddrSpace.h" #include "llvm/Target/TargetMachine.h" namespace llvm { @@ -162,7 +163,8 @@ private: bool SelectScratchOffset(SDNode *N, SDValue Addr, SDValue &VAddr, SDValue &Offset) const; bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr, - SDValue &VOffset, SDValue &Offset) const; + SDValue &VOffset, SDValue &Offset, bool &ScaleOffset, + bool NeedIOffset = true) const; bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset, SDValue &Offset, SDValue &CPol) const; @@ -174,24 +176,31 @@ private: bool checkFlatScratchSVSSwizzleBug(SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const; bool SelectScratchSVAddr(SDNode *N, SDValue Addr, SDValue &VAddr, - SDValue &SAddr, SDValue &Offset) const; + SDValue &SAddr, SDValue &Offset, + SDValue &CPol) const; - bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue *SOffset, + bool SelectSMRDOffset(SDNode *N, SDValue ByteOffsetNode, SDValue *SOffset, SDValue *Offset, bool Imm32Only = false, bool IsBuffer = false, bool HasSOffset = false, - int64_t ImmOffset = 0) const; + int64_t ImmOffset = 0, + bool *ScaleOffset = nullptr) const; SDValue Expand32BitAddress(SDValue Addr) const; - bool SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase, SDValue *SOffset, - SDValue *Offset, bool Imm32Only = false, - bool IsBuffer = false, bool HasSOffset = false, - int64_t ImmOffset = 0) const; - bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue *SOffset, - SDValue *Offset, bool Imm32Only = false) const; + bool SelectSMRDBaseOffset(SDNode *N, SDValue Addr, SDValue &SBase, + SDValue *SOffset, SDValue *Offset, + bool Imm32Only = false, bool IsBuffer = false, + bool HasSOffset = false, 
int64_t ImmOffset = 0, + bool *ScaleOffset = nullptr) const; + bool SelectSMRD(SDNode *N, SDValue Addr, SDValue &SBase, SDValue *SOffset, + SDValue *Offset, bool Imm32Only = false, + bool *ScaleOffset = nullptr) const; bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const; bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const; - bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &SOffset) const; - bool SelectSMRDSgprImm(SDValue Addr, SDValue &SBase, SDValue &SOffset, - SDValue &Offset) const; + bool SelectScaleOffset(SDNode *N, SDValue &Offset, bool IsSigned) const; + bool SelectSMRDSgpr(SDNode *N, SDValue Addr, SDValue &SBase, SDValue &SOffset, + SDValue &CPol) const; + bool SelectSMRDSgprImm(SDNode *N, SDValue Addr, SDValue &SBase, + SDValue &SOffset, SDValue &Offset, + SDValue &CPol) const; bool SelectSMRDBufferImm(SDValue N, SDValue &Offset) const; bool SelectSMRDBufferImm32(SDValue N, SDValue &Offset) const; bool SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset, @@ -246,11 +255,15 @@ private: bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; - bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, - unsigned &Mods) const; + bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods, + MVT VT) const; bool SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3PMadMixBF16ModsExt(SDValue In, SDValue &Src, + SDValue &SrcMods) const; + bool SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src, + SDValue &SrcMods) const; bool SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1, SDValue &Src2, SDValue &Tbl) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 8975486..8ca9a97 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -3494,25 +3494,74 @@ bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const { } /// Match a zero extend from a 32-bit value to 64-bits. -static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) { +Register AMDGPUInstructionSelector::matchZeroExtendFromS32(Register Reg) const { Register ZExtSrc; - if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc)))) - return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register(); + if (mi_match(Reg, *MRI, m_GZExt(m_Reg(ZExtSrc)))) + return MRI->getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register(); // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0) - const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI); + const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI); if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES) return Register(); assert(Def->getNumOperands() == 3 && - MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64)); - if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) { + MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64)); + if (mi_match(Def->getOperand(2).getReg(), *MRI, m_ZeroInt())) { return Def->getOperand(1).getReg(); } return Register(); } +/// Match a sign extend from a 32-bit value to 64-bits. +Register AMDGPUInstructionSelector::matchSignExtendFromS32(Register Reg) const { + Register SExtSrc; + if (mi_match(Reg, *MRI, m_GSExt(m_Reg(SExtSrc)))) + return MRI->getType(SExtSrc) == LLT::scalar(32) ? 
SExtSrc : Register(); + + // Match legalized form %sext = G_MERGE_VALUES (s32 %x), G_ASHR((S32 %x, 31)) + const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI); + if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES) + return Register(); + + assert(Def->getNumOperands() == 3 && + MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64)); + if (mi_match(Def->getOperand(2).getReg(), *MRI, + m_GAShr(m_SpecificReg(Def->getOperand(1).getReg()), + m_SpecificICst(31)))) + return Def->getOperand(1).getReg(); + + if (VT->signBitIsZero(Reg)) + return matchZeroExtendFromS32(Reg); + + return Register(); +} + +/// Match a zero extend from a 32-bit value to 64-bits, or \p Reg itself if it +/// is 32-bit. +Register +AMDGPUInstructionSelector::matchZeroExtendFromS32OrS32(Register Reg) const { + return MRI->getType(Reg) == LLT::scalar(32) ? Reg + : matchZeroExtendFromS32(Reg); +} + +/// Match a sign extend from a 32-bit value to 64-bits, or \p Reg itself if it +/// is 32-bit. +Register +AMDGPUInstructionSelector::matchSignExtendFromS32OrS32(Register Reg) const { + return MRI->getType(Reg) == LLT::scalar(32) ? Reg + : matchSignExtendFromS32(Reg); +} + +Register +AMDGPUInstructionSelector::matchExtendFromS32OrS32(Register Reg, + bool IsSigned) const { + if (IsSigned) + return matchSignExtendFromS32OrS32(Reg); + + return matchZeroExtendFromS32OrS32(Reg); +} + Register AMDGPUInstructionSelector::matchAnyExtendFromS32(Register Reg) const { Register AnyExtSrc; if (mi_match(Reg, *MRI, m_GAnyExt(m_Reg(AnyExtSrc)))) @@ -3581,7 +3630,7 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{ getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI); if (isSGPR(SAddr)) { Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg(); - if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) { + if (Register Off = matchZeroExtendFromS32(PtrBaseOffset)) { Addr = SAddr; VOffset = Off; } @@ -5223,7 +5272,7 @@ AMDGPUInstructionSelector::selectSWMMACIndex32(MachineOperand &Root) const { getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg(); unsigned Key = 0; - Register S32 = matchZeroExtendFromS32(*MRI, Src); + Register S32 = matchZeroExtendFromS32(Src); if (!S32) S32 = matchAnyExtendFromS32(Src); @@ -5296,10 +5345,68 @@ AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const { }}; } +// Given \p Offset and load specified by the \p Root operand check if \p Offset +// is a multiple of the load byte size. If it is update \p Offset to a +// pre-scaled value and return true. +bool AMDGPUInstructionSelector::selectScaleOffset(MachineOperand &Root, + Register &Offset, + bool IsSigned) const { + if (!Subtarget->hasScaleOffset()) + return false; + + const MachineInstr &MI = *Root.getParent(); + MachineMemOperand *MMO = *MI.memoperands_begin(); + + if (!MMO->getSize().hasValue()) + return false; + + uint64_t Size = MMO->getSize().getValue(); + + Register OffsetReg = matchExtendFromS32OrS32(Offset, IsSigned); + if (!OffsetReg) + OffsetReg = Offset; + + if (auto Def = getDefSrcRegIgnoringCopies(OffsetReg, *MRI)) + OffsetReg = Def->Reg; + + Register Op0; + MachineInstr *Mul; + bool ScaleOffset = + (isPowerOf2_64(Size) && + mi_match(OffsetReg, *MRI, + m_GShl(m_Reg(Op0), + m_any_of(m_SpecificICst(Log2_64(Size)), + m_Copy(m_SpecificICst(Log2_64(Size))))))) || + mi_match(OffsetReg, *MRI, + m_GMul(m_Reg(Op0), m_any_of(m_SpecificICst(Size), + m_Copy(m_SpecificICst(Size))))) || + mi_match( + OffsetReg, *MRI, + m_BinOp(IsSigned ? 
AMDGPU::S_MUL_I64_I32_PSEUDO : AMDGPU::S_MUL_U64, + m_Reg(Op0), m_SpecificICst(Size))) || + // Match G_AMDGPU_MAD_U64_U32 offset, c, 0 + (mi_match(OffsetReg, *MRI, m_MInstr(Mul)) && + (Mul->getOpcode() == (IsSigned ? AMDGPU::G_AMDGPU_MAD_I64_I32 + : AMDGPU::G_AMDGPU_MAD_U64_U32) || + (IsSigned && Mul->getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32 && + VT->signBitIsZero(Mul->getOperand(2).getReg()))) && + mi_match(Mul->getOperand(4).getReg(), *MRI, m_ZeroInt()) && + mi_match(Mul->getOperand(3).getReg(), *MRI, + m_GTrunc(m_any_of(m_SpecificICst(Size), + m_Copy(m_SpecificICst(Size))))) && + mi_match(Mul->getOperand(2).getReg(), *MRI, m_Reg(Op0))); + + if (ScaleOffset) + Offset = Op0; + + return ScaleOffset; +} + bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root, Register &Base, Register *SOffset, - int64_t *Offset) const { + int64_t *Offset, + bool *ScaleOffset) const { MachineInstr *MI = Root.getParent(); MachineBasicBlock *MBB = MI->getParent(); @@ -5314,6 +5421,9 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root, const GEPInfo &GEPI = AddrInfo[0]; std::optional<int64_t> EncodedImm; + if (ScaleOffset) + *ScaleOffset = false; + if (SOffset && Offset) { EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false, /*HasSOffset=*/true); @@ -5321,8 +5431,12 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root, AddrInfo.size() > 1) { const GEPInfo &GEPI2 = AddrInfo[1]; if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) { - if (Register OffsetReg = - matchZeroExtendFromS32(*MRI, GEPI2.SgprParts[1])) { + Register OffsetReg = GEPI2.SgprParts[1]; + if (ScaleOffset) + *ScaleOffset = + selectScaleOffset(Root, OffsetReg, false /* IsSigned */); + OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg); + if (OffsetReg) { Base = GEPI2.SgprParts[0]; *SOffset = OffsetReg; *Offset = *EncodedImm; @@ -5367,7 +5481,11 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root, } if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) { - if (Register OffsetReg = matchZeroExtendFromS32(*MRI, GEPI.SgprParts[1])) { + Register OffsetReg = GEPI.SgprParts[1]; + if (ScaleOffset) + *ScaleOffset = selectScaleOffset(Root, OffsetReg, false /* IsSigned */); + OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg); + if (OffsetReg) { Base = GEPI.SgprParts[0]; *SOffset = OffsetReg; return true; @@ -5381,7 +5499,8 @@ InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const { Register Base; int64_t Offset; - if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset)) + if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset, + /* ScaleOffset */ nullptr)) return std::nullopt; return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); }, @@ -5412,23 +5531,30 @@ AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const { InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const { Register Base, SOffset; - if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr)) + bool ScaleOffset; + if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr, + &ScaleOffset)) return std::nullopt; + unsigned CPol = ScaleOffset ? 
AMDGPU::CPol::SCAL : 0; return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); }, - [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}}; + [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}}; } InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const { Register Base, SOffset; int64_t Offset; - if (!selectSmrdOffset(Root, Base, &SOffset, &Offset)) + bool ScaleOffset; + if (!selectSmrdOffset(Root, Base, &SOffset, &Offset, &ScaleOffset)) return std::nullopt; + unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0; return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); }, [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}}; + [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}}; } std::pair<Register, int> @@ -5490,7 +5616,8 @@ AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const { // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset) InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root, - unsigned CPolBits) const { + unsigned CPolBits, + bool NeedIOffset) const { Register Addr = Root.getReg(); Register PtrBase; int64_t ConstOffset; @@ -5501,7 +5628,8 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root, std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); if (ConstOffset != 0) { - if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, + if (NeedIOffset && + TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal)) { Addr = PtrBase; ImmOffset = ConstOffset; @@ -5514,11 +5642,15 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root, // saddr + large_offset -> saddr + // (voffset = large_offset & ~MaxOffset) + // (large_offset & MaxOffset); - int64_t SplitImmOffset, RemainderOffset; - std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset( - ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal); + int64_t SplitImmOffset = 0, RemainderOffset = ConstOffset; + if (NeedIOffset) { + std::tie(SplitImmOffset, RemainderOffset) = + TII.splitFlatOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, + SIInstrFlags::FlatGlobal); + } - if (isUInt<32>(RemainderOffset)) { + if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset) + : isUInt<32>(RemainderOffset)) { MachineInstr *MI = Root.getParent(); MachineBasicBlock *MBB = MI->getParent(); Register HighBits = @@ -5528,12 +5660,22 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root, HighBits) .addImm(RemainderOffset); + if (NeedIOffset) + return {{ + [=](MachineInstrBuilder &MIB) { + MIB.addReg(PtrBase); + }, // saddr + [=](MachineInstrBuilder &MIB) { + MIB.addReg(HighBits); + }, // voffset + [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); }, + }}; return {{ [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr [=](MachineInstrBuilder &MIB) { MIB.addReg(HighBits); }, // voffset - [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); }, [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); }, }}; } @@ -5565,18 +5707,33 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root, // It's possible voffset is an SGPR here, but the copy to VGPR will be // inserted later. 
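The selectSmrdSgpr/selectSmrdSgprImm changes above thread a cpol operand through so the SCAL bit can be set when the SGPR offset was pre-scaled. A value-level sketch of the invariant the scale-offset matchers check (illustrative only; the real code pattern-matches index << log2(size), index * size and the MAD forms rather than runtime values):

#include <cstdint>
#include <optional>

struct ScaledOffset {
  uint64_t Index; // what would be placed in the soffset operand
  bool Scale;     // whether the CPol SCAL bit would be set
};

// The byte offset must be a whole number of loaded elements; only then can
// the raw index be passed and the hardware asked to scale it by the access
// size.
static std::optional<ScaledOffset> tryPreScale(uint64_t ByteOffset,
                                               uint64_t LoadSize) {
  if (LoadSize == 0 || ByteOffset % LoadSize != 0)
    return std::nullopt;
  return ScaledOffset{ByteOffset / LoadSize, /*Scale=*/true};
}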
- if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) { + bool ScaleOffset = selectScaleOffset(Root, PtrBaseOffset, + Subtarget->hasSignedGVSOffset()); + if (Register VOffset = matchExtendFromS32OrS32( + PtrBaseOffset, Subtarget->hasSignedGVSOffset())) { + if (NeedIOffset) + return {{[=](MachineInstrBuilder &MIB) { // saddr + MIB.addReg(SAddr); + }, + [=](MachineInstrBuilder &MIB) { // voffset + MIB.addReg(VOffset); + }, + [=](MachineInstrBuilder &MIB) { // offset + MIB.addImm(ImmOffset); + }, + [=](MachineInstrBuilder &MIB) { // cpol + MIB.addImm(CPolBits | + (ScaleOffset ? AMDGPU::CPol::SCAL : 0)); + }}}; return {{[=](MachineInstrBuilder &MIB) { // saddr MIB.addReg(SAddr); }, [=](MachineInstrBuilder &MIB) { // voffset MIB.addReg(VOffset); }, - [=](MachineInstrBuilder &MIB) { // offset - MIB.addImm(ImmOffset); - }, [=](MachineInstrBuilder &MIB) { // cpol - MIB.addImm(CPolBits); + MIB.addImm(CPolBits | + (ScaleOffset ? AMDGPU::CPol::SCAL : 0)); }}}; } } @@ -5597,10 +5754,16 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root, BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset) .addImm(0); + if (NeedIOffset) + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr + [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset + [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset + [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol + }}; return {{ [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset - [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol }}; } @@ -5732,22 +5895,32 @@ AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const { if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset)) return std::nullopt; + unsigned CPol = selectScaleOffset(Root, RHS, true /* IsSigned */) + ? 
AMDGPU::CPol::SCAL + : 0; + if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) { int FI = LHSDef->MI->getOperand(1).getIndex(); return {{ - [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr + [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr - [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset + [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset + [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol }}; } if (!isSGPR(LHS)) + if (auto Def = getDefSrcRegIgnoringCopies(LHS, *MRI)) + LHS = Def->Reg; + + if (!isSGPR(LHS)) return std::nullopt; return {{ - [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr - [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr - [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset + [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr + [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr + [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset + [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol }}; } @@ -6895,6 +7068,17 @@ void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB, MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4); } +void AMDGPUInstructionSelector::renderPrefetchLoc(MachineInstrBuilder &MIB, + const MachineInstr &MI, + int OpIdx) const { + uint32_t V = MI.getOperand(2).getImm(); + V = (AMDGPU::CPol::SCOPE_MASK - (V & AMDGPU::CPol::SCOPE_MASK)) + << AMDGPU::CPol::SCOPE_SHIFT; + if (!Subtarget->hasSafeCUPrefetch()) + V = std::max(V, (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe + MIB.addImm(V); +} + /// Convert from 2-bit value to enum values used for op_sel* source modifiers. void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 34bdf0a..61d9de1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -232,8 +232,10 @@ private: InstructionSelector::ComplexRendererFns selectVINTERPModsHi(MachineOperand &Root) const; + bool selectScaleOffset(MachineOperand &Root, Register &Offset, + bool IsSigned) const; bool selectSmrdOffset(MachineOperand &Root, Register &Base, Register *SOffset, - int64_t *Offset) const; + int64_t *Offset, bool *ScaleOffset) const; InstructionSelector::ComplexRendererFns selectSmrdImm(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns @@ -254,7 +256,8 @@ private: selectScratchOffset(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns - selectGlobalSAddr(MachineOperand &Root, unsigned CPolBits) const; + selectGlobalSAddr(MachineOperand &Root, unsigned CPolBits, + bool NeedIOffset = true) const; InstructionSelector::ComplexRendererFns selectGlobalSAddr(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns @@ -411,6 +414,10 @@ private: void renderRoundMode(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const; + + void renderPrefetchLoc(MachineInstrBuilder &MIB, const MachineInstr &MI, + int OpIdx) const; + void renderScaledMAIIntrinsicOperand(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const; @@ -421,6 +428,19 @@ private: // shift amount operand's `ShAmtBits` bits is unneeded. 
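renderPrefetchLoc above maps the target-independent prefetch locality (0..3, where 3 means highest temporal locality) onto an inverted cache-scope value and clamps it when CU-scope prefetch is unsafe. A worked sketch of that arithmetic; the constant values below are assumptions for the example, the real ones live in AMDGPU's CPol definitions:

#include <algorithm>
#include <cstdint>
#include <cstdio>

namespace CPolSketch {                 // hypothetical stand-ins, not LLVM's values
constexpr uint32_t SCOPE_MASK = 0x3;   // assumed 2-bit scope field
constexpr uint32_t SCOPE_SHIFT = 3;    // assumed bit position inside cpol
constexpr uint32_t SCOPE_SE = 1u << SCOPE_SHIFT;
} // namespace CPolSketch

static uint32_t prefetchLocToScope(uint32_t Loc, bool HasSafeCUPrefetch) {
  using namespace CPolSketch;
  // Invert the field: locality 3 (keep closest) becomes the tightest scope,
  // locality 0 becomes the widest one.
  uint32_t V = (SCOPE_MASK - (Loc & SCOPE_MASK)) << SCOPE_SHIFT;
  if (!HasSafeCUPrefetch)
    V = std::max(V, SCOPE_SE); // never emit CU scope when it is unsafe
  return V;
}

int main() {
  std::printf("%u %u\n", (unsigned)prefetchLocToScope(3, true),  // 0: tightest scope
                         (unsigned)prefetchLocToScope(0, true)); // 24: widest scope
  return 0;
}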
bool isUnneededShiftMask(const MachineInstr &MI, unsigned ShAmtBits) const; + /// Match a zero extend from a 32-bit value to 64-bits. + Register matchZeroExtendFromS32(Register Reg) const; + /// Match a sign extend from a 32-bit value to 64-bits. + Register matchSignExtendFromS32(Register Reg) const; + /// Match a zero extend from a 32-bit value to 64-bits, or \p Reg itself if it + /// is 32-bit. + Register matchZeroExtendFromS32OrS32(Register Reg) const; + /// Match a sign extend from a 32-bit value to 64-bits, or \p Reg itself if it + /// is 32-bit. + Register matchSignExtendFromS32OrS32(Register Reg) const; + /// Match either sign or zero extend depending on the \p IsSigned from a + /// 32-bit value to 64-bits, or \p Reg itself if it is 32-bit. + Register matchExtendFromS32OrS32(Register Reg, bool IsSigned) const; /// Match an any extend from a 32-bit value to 64-bit. Register matchAnyExtendFromS32(Register Reg) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index e7bf88d..fedfa3f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -4208,6 +4208,9 @@ bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper, assert(Ty.isScalar()); unsigned Size = Ty.getSizeInBits(); + if (ST.hasVectorMulU64() && Size == 64) + return true; + unsigned NumParts = Size / 32; assert((Size % 32) == 0); assert(NumParts >= 2); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp index ba66134..e187959 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp @@ -23,6 +23,8 @@ #include "GCNSubtarget.h" #include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h" +#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineUniformityAnalysis.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -115,126 +117,233 @@ public: VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)), VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}; - bool isLaneMask(Register Reg) { - const RegisterBank *RB = MRI.getRegBankOrNull(Reg); - if (RB && RB->getID() == AMDGPU::VCCRegBankID) - return true; + bool isLaneMask(Register Reg); + std::pair<MachineInstr *, Register> tryMatch(Register Src, unsigned Opcode); + std::pair<GUnmerge *, int> tryMatchRALFromUnmerge(Register Src); + Register getReadAnyLaneSrc(Register Src); + void replaceRegWithOrBuildCopy(Register Dst, Register Src); - const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg); - return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1); - } + bool tryEliminateReadAnyLane(MachineInstr &Copy); + void tryCombineCopy(MachineInstr &MI); + void tryCombineS1AnyExt(MachineInstr &MI); +}; - void cleanUpAfterCombine(MachineInstr &MI, MachineInstr *Optional0) { - MI.eraseFromParent(); - if (Optional0 && isTriviallyDead(*Optional0, MRI)) - Optional0->eraseFromParent(); - } +bool AMDGPURegBankLegalizeCombiner::isLaneMask(Register Reg) { + const RegisterBank *RB = MRI.getRegBankOrNull(Reg); + if (RB && RB->getID() == AMDGPU::VCCRegBankID) + return true; - std::pair<MachineInstr *, Register> tryMatch(Register Src, unsigned Opcode) { - MachineInstr *MatchMI = MRI.getVRegDef(Src); - if (MatchMI->getOpcode() != Opcode) - return {nullptr, Register()}; - return {MatchMI, 
MatchMI->getOperand(1).getReg()}; - } + const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg); + return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1); +} - void tryCombineCopy(MachineInstr &MI) { - Register Dst = MI.getOperand(0).getReg(); - Register Src = MI.getOperand(1).getReg(); - // Skip copies of physical registers. - if (!Dst.isVirtual() || !Src.isVirtual()) - return; - - // This is a cross bank copy, sgpr S1 to lane mask. - // - // %Src:sgpr(s1) = G_TRUNC %TruncS32Src:sgpr(s32) - // %Dst:lane-mask(s1) = COPY %Src:sgpr(s1) - // -> - // %Dst:lane-mask(s1) = G_AMDGPU_COPY_VCC_SCC %TruncS32Src:sgpr(s32) - if (isLaneMask(Dst) && MRI.getRegBankOrNull(Src) == SgprRB) { - auto [Trunc, TruncS32Src] = tryMatch(Src, AMDGPU::G_TRUNC); - assert(Trunc && MRI.getType(TruncS32Src) == S32 && - "sgpr S1 must be result of G_TRUNC of sgpr S32"); - - B.setInstr(MI); - // Ensure that truncated bits in BoolSrc are 0. - auto One = B.buildConstant({SgprRB, S32}, 1); - auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One); - B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {Dst}, {BoolSrc}); - cleanUpAfterCombine(MI, Trunc); - return; - } +std::pair<MachineInstr *, Register> +AMDGPURegBankLegalizeCombiner::tryMatch(Register Src, unsigned Opcode) { + MachineInstr *MatchMI = MRI.getVRegDef(Src); + if (MatchMI->getOpcode() != Opcode) + return {nullptr, Register()}; + return {MatchMI, MatchMI->getOperand(1).getReg()}; +} + +std::pair<GUnmerge *, int> +AMDGPURegBankLegalizeCombiner::tryMatchRALFromUnmerge(Register Src) { + MachineInstr *ReadAnyLane = MRI.getVRegDef(Src); + if (ReadAnyLane->getOpcode() != AMDGPU::G_AMDGPU_READANYLANE) + return {nullptr, -1}; + + Register RALSrc = ReadAnyLane->getOperand(1).getReg(); + if (auto *UnMerge = getOpcodeDef<GUnmerge>(RALSrc, MRI)) + return {UnMerge, UnMerge->findRegisterDefOperandIdx(RALSrc, nullptr)}; - // Src = G_AMDGPU_READANYLANE RALSrc - // Dst = COPY Src - // -> - // Dst = RALSrc - if (MRI.getRegBankOrNull(Dst) == VgprRB && - MRI.getRegBankOrNull(Src) == SgprRB) { - auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE); - if (!RAL) - return; - - assert(MRI.getRegBank(RALSrc) == VgprRB); - MRI.replaceRegWith(Dst, RALSrc); - cleanUpAfterCombine(MI, RAL); - return; + return {nullptr, -1}; +} + +Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) { + // Src = G_AMDGPU_READANYLANE RALSrc + auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE); + if (RAL) + return RALSrc; + + // LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc + // LoSgpr = G_AMDGPU_READANYLANE LoVgpr + // HiSgpr = G_AMDGPU_READANYLANE HiVgpr + // Src G_MERGE_VALUES LoSgpr, HiSgpr + auto *Merge = getOpcodeDef<GMergeLikeInstr>(Src, MRI); + if (Merge) { + unsigned NumElts = Merge->getNumSources(); + auto [Unmerge, Idx] = tryMatchRALFromUnmerge(Merge->getSourceReg(0)); + if (!Unmerge || Unmerge->getNumDefs() != NumElts || Idx != 0) + return {}; + + // Check if all elements are from same unmerge and there is no shuffling. + for (unsigned i = 1; i < NumElts; ++i) { + auto [UnmergeI, IdxI] = tryMatchRALFromUnmerge(Merge->getSourceReg(i)); + if (UnmergeI != Unmerge || (unsigned)IdxI != i) + return {}; } + return Unmerge->getSourceReg(); } - void tryCombineS1AnyExt(MachineInstr &MI) { - // %Src:sgpr(S1) = G_TRUNC %TruncSrc - // %Dst = G_ANYEXT %Src:sgpr(S1) - // -> - // %Dst = G_... 
%TruncSrc - Register Dst = MI.getOperand(0).getReg(); - Register Src = MI.getOperand(1).getReg(); - if (MRI.getType(Src) != S1) - return; - - auto [Trunc, TruncSrc] = tryMatch(Src, AMDGPU::G_TRUNC); - if (!Trunc) - return; - - LLT DstTy = MRI.getType(Dst); - LLT TruncSrcTy = MRI.getType(TruncSrc); - - if (DstTy == TruncSrcTy) { - MRI.replaceRegWith(Dst, TruncSrc); - cleanUpAfterCombine(MI, Trunc); - return; - } + // SrcRegIdx = G_AMDGPU_READANYLANE RALElSrc + // SourceReg G_MERGE_VALUES ..., SrcRegIdx, ... + // ..., Src, ... = G_UNMERGE_VALUES SourceReg + auto *UnMerge = getOpcodeDef<GUnmerge>(Src, MRI); + if (!UnMerge) + return {}; + + int Idx = UnMerge->findRegisterDefOperandIdx(Src, nullptr); + Merge = getOpcodeDef<GMergeLikeInstr>(UnMerge->getSourceReg(), MRI); + if (!Merge || UnMerge->getNumDefs() != Merge->getNumSources()) + return {}; + + Register SrcRegIdx = Merge->getSourceReg(Idx); + if (MRI.getType(Src) != MRI.getType(SrcRegIdx)) + return {}; + + auto [RALEl, RALElSrc] = tryMatch(SrcRegIdx, AMDGPU::G_AMDGPU_READANYLANE); + if (RALEl) + return RALElSrc; + + return {}; +} + +void AMDGPURegBankLegalizeCombiner::replaceRegWithOrBuildCopy(Register Dst, + Register Src) { + if (Dst.isVirtual()) + MRI.replaceRegWith(Dst, Src); + else + B.buildCopy(Dst, Src); +} + +bool AMDGPURegBankLegalizeCombiner::tryEliminateReadAnyLane( + MachineInstr &Copy) { + Register Dst = Copy.getOperand(0).getReg(); + Register Src = Copy.getOperand(1).getReg(); + + // Skip non-vgpr Dst + if (Dst.isVirtual() ? (MRI.getRegBankOrNull(Dst) != VgprRB) + : !TRI.isVGPR(MRI, Dst)) + return false; + + // Skip physical source registers and source registers with register class + if (!Src.isVirtual() || MRI.getRegClassOrNull(Src)) + return false; + + Register RALDst = Src; + MachineInstr &SrcMI = *MRI.getVRegDef(Src); + if (SrcMI.getOpcode() == AMDGPU::G_BITCAST) + RALDst = SrcMI.getOperand(1).getReg(); + + Register RALSrc = getReadAnyLaneSrc(RALDst); + if (!RALSrc) + return false; + + B.setInstr(Copy); + if (SrcMI.getOpcode() != AMDGPU::G_BITCAST) { + // Src = READANYLANE RALSrc Src = READANYLANE RALSrc + // Dst = Copy Src $Dst = Copy Src + // -> -> + // Dst = RALSrc $Dst = Copy RALSrc + replaceRegWithOrBuildCopy(Dst, RALSrc); + } else { + // RALDst = READANYLANE RALSrc RALDst = READANYLANE RALSrc + // Src = G_BITCAST RALDst Src = G_BITCAST RALDst + // Dst = Copy Src Dst = Copy Src + // -> -> + // NewVgpr = G_BITCAST RALDst NewVgpr = G_BITCAST RALDst + // Dst = NewVgpr $Dst = Copy NewVgpr + auto Bitcast = B.buildBitcast({VgprRB, MRI.getType(Src)}, RALSrc); + replaceRegWithOrBuildCopy(Dst, Bitcast.getReg(0)); + } + + eraseInstr(Copy, MRI); + return true; +} + +void AMDGPURegBankLegalizeCombiner::tryCombineCopy(MachineInstr &MI) { + if (tryEliminateReadAnyLane(MI)) + return; + + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + // Skip copies of physical registers. + if (!Dst.isVirtual() || !Src.isVirtual()) + return; + + // This is a cross bank copy, sgpr S1 to lane mask. 
+ // + // %Src:sgpr(s1) = G_TRUNC %TruncS32Src:sgpr(s32) + // %Dst:lane-mask(s1) = COPY %Src:sgpr(s1) + // -> + // %BoolSrc:sgpr(s32) = G_AND %TruncS32Src:sgpr(s32), 1 + // %Dst:lane-mask(s1) = G_AMDGPU_COPY_VCC_SCC %BoolSrc:sgpr(s32) + if (isLaneMask(Dst) && MRI.getRegBankOrNull(Src) == SgprRB) { + auto [Trunc, TruncS32Src] = tryMatch(Src, AMDGPU::G_TRUNC); + assert(Trunc && MRI.getType(TruncS32Src) == S32 && + "sgpr S1 must be result of G_TRUNC of sgpr S32"); B.setInstr(MI); + // Ensure that truncated bits in BoolSrc are 0. + auto One = B.buildConstant({SgprRB, S32}, 1); + auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One); + B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {Dst}, {BoolSrc}); + eraseInstr(MI, MRI); + } +} - if (DstTy == S32 && TruncSrcTy == S64) { - auto Unmerge = B.buildUnmerge({SgprRB, S32}, TruncSrc); - MRI.replaceRegWith(Dst, Unmerge.getReg(0)); - cleanUpAfterCombine(MI, Trunc); - return; - } +void AMDGPURegBankLegalizeCombiner::tryCombineS1AnyExt(MachineInstr &MI) { + // %Src:sgpr(S1) = G_TRUNC %TruncSrc + // %Dst = G_ANYEXT %Src:sgpr(S1) + // -> + // %Dst = G_... %TruncSrc + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + if (MRI.getType(Src) != S1) + return; + + auto [Trunc, TruncSrc] = tryMatch(Src, AMDGPU::G_TRUNC); + if (!Trunc) + return; + + LLT DstTy = MRI.getType(Dst); + LLT TruncSrcTy = MRI.getType(TruncSrc); + + if (DstTy == TruncSrcTy) { + MRI.replaceRegWith(Dst, TruncSrc); + eraseInstr(MI, MRI); + return; + } - if (DstTy == S64 && TruncSrcTy == S32) { - B.buildMergeLikeInstr(MI.getOperand(0).getReg(), - {TruncSrc, B.buildUndef({SgprRB, S32})}); - cleanUpAfterCombine(MI, Trunc); - return; - } + B.setInstr(MI); - if (DstTy == S32 && TruncSrcTy == S16) { - B.buildAnyExt(Dst, TruncSrc); - cleanUpAfterCombine(MI, Trunc); - return; - } + if (DstTy == S32 && TruncSrcTy == S64) { + auto Unmerge = B.buildUnmerge({SgprRB, S32}, TruncSrc); + MRI.replaceRegWith(Dst, Unmerge.getReg(0)); + eraseInstr(MI, MRI); + return; + } - if (DstTy == S16 && TruncSrcTy == S32) { - B.buildTrunc(Dst, TruncSrc); - cleanUpAfterCombine(MI, Trunc); - return; - } + if (DstTy == S64 && TruncSrcTy == S32) { + B.buildMergeLikeInstr(MI.getOperand(0).getReg(), + {TruncSrc, B.buildUndef({SgprRB, S32})}); + eraseInstr(MI, MRI); + return; + } - llvm_unreachable("missing anyext + trunc combine"); + if (DstTy == S32 && TruncSrcTy == S16) { + B.buildAnyExt(Dst, TruncSrc); + eraseInstr(MI, MRI); + return; } -}; + + if (DstTy == S16 && TruncSrcTy == S32) { + B.buildTrunc(Dst, TruncSrc); + eraseInstr(MI, MRI); + return; + } + + llvm_unreachable("missing anyext + trunc combine"); +} // Search through MRI for virtual registers with sgpr register bank and S1 LLT. 
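The cross-bank copy combine above inserts G_AND %TruncS32Src, 1 before G_AMDGPU_COPY_VCC_SCC because an sgpr s1 lives in a 32-bit register whose bits above bit 0 are undefined. A small value-level sketch of why the masking is needed (illustration only):

#include <cassert>
#include <cstdint>

static uint32_t copyVccScc(uint32_t BoolSrc) {
  // Models the expectation of the lane-mask/SCC copy: the truncated bits must
  // already be cleared, which is exactly what the combine guarantees with the
  // G_AND.
  assert(BoolSrc <= 1 && "truncated bits must be zero");
  return BoolSrc;
}

int main() {
  uint32_t TruncS32Src = 0xDEADBEEFu;  // garbage above bit 0, bit 0 set
  uint32_t BoolSrc = TruncS32Src & 1u; // the G_AND ..., 1 from the combine
  assert(copyVccScc(BoolSrc) == 1u);
  return 0;
}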
[[maybe_unused]] static Register getAnySgprS1(const MachineRegisterInfo &MRI) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 411159c..f471881 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -33,7 +33,7 @@ RegBankLegalizeHelper::RegBankLegalizeHelper( MachineIRBuilder &B, const MachineUniformityInfo &MUI, const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules) : ST(B.getMF().getSubtarget<GCNSubtarget>()), B(B), MRI(*B.getMRI()), - MUI(MUI), RBI(RBI), RBLRules(RBLRules), + MUI(MUI), RBI(RBI), RBLRules(RBLRules), IsWave32(ST.isWave32()), SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)), VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)), VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {} @@ -56,6 +56,224 @@ void RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) { lower(MI, Mapping, WaterfallSgprs); } +bool RegBankLegalizeHelper::executeInWaterfallLoop( + MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range, + SmallSet<Register, 4> &SGPROperandRegs) { + // Track use registers which have already been expanded with a readfirstlane + // sequence. This may have multiple uses if moving a sequence. + DenseMap<Register, Register> WaterfalledRegMap; + + MachineBasicBlock &MBB = B.getMBB(); + MachineFunction &MF = B.getMF(); + + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass(); + unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc, ExecReg; + if (IsWave32) { + MovExecOpc = AMDGPU::S_MOV_B32; + MovExecTermOpc = AMDGPU::S_MOV_B32_term; + XorTermOpc = AMDGPU::S_XOR_B32_term; + AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32; + ExecReg = AMDGPU::EXEC_LO; + } else { + MovExecOpc = AMDGPU::S_MOV_B64; + MovExecTermOpc = AMDGPU::S_MOV_B64_term; + XorTermOpc = AMDGPU::S_XOR_B64_term; + AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64; + ExecReg = AMDGPU::EXEC; + } + +#ifndef NDEBUG + const int OrigRangeSize = std::distance(Range.begin(), Range.end()); +#endif + + MachineRegisterInfo &MRI = *B.getMRI(); + Register SaveExecReg = MRI.createVirtualRegister(WaveRC); + Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC); + + // Don't bother using generic instructions/registers for the exec mask. + B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg); + + Register SavedExec = MRI.createVirtualRegister(WaveRC); + + // To insert the loop we need to split the block. Move everything before + // this point to a new block, and insert a new empty block before this + // instruction. + MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock(); + MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock(); + MachineBasicBlock *RestoreExecBB = MF.CreateMachineBasicBlock(); + MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock(); + MachineFunction::iterator MBBI(MBB); + ++MBBI; + MF.insert(MBBI, LoopBB); + MF.insert(MBBI, BodyBB); + MF.insert(MBBI, RestoreExecBB); + MF.insert(MBBI, RemainderBB); + + LoopBB->addSuccessor(BodyBB); + BodyBB->addSuccessor(RestoreExecBB); + BodyBB->addSuccessor(LoopBB); + + // Move the rest of the block into a new block. + RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); + RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end()); + + MBB.addSuccessor(LoopBB); + RestoreExecBB->addSuccessor(RemainderBB); + + B.setInsertPt(*LoopBB, LoopBB->end()); + + // +-MBB:------------+ + // | ... 
| + // | %0 = G_INST_1 | + // | %Dst = MI %Vgpr | + // | %1 = G_INST_2 | + // | ... | + // +-----------------+ + // -> + // +-MBB-------------------------------+ + // | ... | + // | %0 = G_INST_1 | + // | %SaveExecReg = S_MOV_B32 $exec_lo | + // +----------------|------------------+ + // | /------------------------------| + // V V | + // +-LoopBB---------------------------------------------------------------+ | + // | %CurrentLaneReg:sgpr(s32) = READFIRSTLANE %Vgpr | | + // | instead of executing for each lane, see if other lanes had | | + // | same value for %Vgpr and execute for them also. | | + // | %CondReg:vcc(s1) = G_ICMP eq %CurrentLaneReg, %Vgpr | | + // | %CondRegLM:sreg_32 = ballot %CondReg // copy vcc to sreg32 lane mask | | + // | %SavedExec = S_AND_SAVEEXEC_B32 %CondRegLM | | + // | exec is active for lanes with the same "CurrentLane value" in Vgpr | | + // +----------------|-----------------------------------------------------+ | + // V | + // +-BodyBB------------------------------------------------------------+ | + // | %Dst = MI %CurrentLaneReg:sgpr(s32) | | + // | executed only for active lanes and written to Dst | | + // | $exec = S_XOR_B32 $exec, %SavedExec | | + // | set active lanes to 0 in SavedExec, lanes that did not write to | | + // | Dst yet, and set this as new exec (for READFIRSTLANE and ICMP) | | + // | SI_WATERFALL_LOOP LoopBB |-----| + // +----------------|--------------------------------------------------+ + // V + // +-RestoreExecBB--------------------------+ + // | $exec_lo = S_MOV_B32_term %SaveExecReg | + // +----------------|-----------------------+ + // V + // +-RemainderBB:----------------------+ + // | %1 = G_INST_2 | + // | ... | + // +---------------------------------- + + + // Move the instruction into the loop body. Note we moved everything after + // Range.end() already into a new block, so Range.end() is no longer valid. + BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end()); + + // Figure out the iterator range after splicing the instructions. + MachineBasicBlock::iterator NewBegin = Range.begin()->getIterator(); + auto NewEnd = BodyBB->end(); + assert(std::distance(NewBegin, NewEnd) == OrigRangeSize); + + B.setMBB(*LoopBB); + Register CondReg; + + for (MachineInstr &MI : make_range(NewBegin, NewEnd)) { + for (MachineOperand &Op : MI.all_uses()) { + Register OldReg = Op.getReg(); + if (!SGPROperandRegs.count(OldReg)) + continue; + + // See if we already processed this register in another instruction in + // the sequence. + auto OldVal = WaterfalledRegMap.find(OldReg); + if (OldVal != WaterfalledRegMap.end()) { + Op.setReg(OldVal->second); + continue; + } + + Register OpReg = Op.getReg(); + LLT OpTy = MRI.getType(OpReg); + + // TODO: support for agpr + assert(MRI.getRegBank(OpReg) == VgprRB); + Register CurrentLaneReg = MRI.createVirtualRegister({SgprRB, OpTy}); + buildReadFirstLane(B, CurrentLaneReg, OpReg, RBI); + + // Build the comparison(s), CurrentLaneReg == OpReg. + unsigned OpSize = OpTy.getSizeInBits(); + unsigned PartSize = (OpSize % 64 == 0) ? 
64 : 32; + LLT PartTy = LLT::scalar(PartSize); + unsigned NumParts = OpSize / PartSize; + SmallVector<Register, 8> OpParts; + SmallVector<Register, 8> CurrentLaneParts; + + if (NumParts == 1) { + OpParts.push_back(OpReg); + CurrentLaneParts.push_back(CurrentLaneReg); + } else { + auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg); + auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg); + for (unsigned i = 0; i < NumParts; ++i) { + OpParts.push_back(UnmergeOp.getReg(i)); + CurrentLaneParts.push_back(UnmergeCurrLane.getReg(i)); + } + } + + for (unsigned i = 0; i < NumParts; ++i) { + Register CmpReg = MRI.createVirtualRegister(VccRB_S1); + B.buildICmp(CmpInst::ICMP_EQ, CmpReg, CurrentLaneParts[i], OpParts[i]); + + if (!CondReg) + CondReg = CmpReg; + else + CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0); + } + + Op.setReg(CurrentLaneReg); + + // Make sure we don't re-process this register again. + WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg())); + } + } + + // Copy vcc to sgpr32/64, ballot becomes a no-op during instruction selection. + Register CondRegLM = + MRI.createVirtualRegister({WaveRC, LLT::scalar(IsWave32 ? 32 : 64)}); + B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg); + + // Update EXEC, save the original EXEC value to SavedExec. + B.buildInstr(AndSaveExecOpc) + .addDef(SavedExec) + .addReg(CondRegLM, RegState::Kill); + MRI.setSimpleHint(SavedExec, CondRegLM); + + B.setInsertPt(*BodyBB, BodyBB->end()); + + // Update EXEC, switch all done bits to 0 and all todo bits to 1. + B.buildInstr(XorTermOpc).addDef(ExecReg).addReg(ExecReg).addReg(SavedExec); + + // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use + // s_cbranch_scc0? + + // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover. + B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB); + + // Save the EXEC mask before the loop. + B.setInsertPt(MBB, MBB.end()); + B.buildInstr(MovExecOpc).addDef(SaveExecReg).addReg(ExecReg); + + // Restore the EXEC mask after the loop. + B.setInsertPt(*RestoreExecBB, RestoreExecBB->begin()); + B.buildInstr(MovExecTermOpc).addDef(ExecReg).addReg(SaveExecReg); + + // Set the insert point after the original instruction, so any new + // instructions will be in the remainder. + B.setInsertPt(*RemainderBB, RemainderBB->begin()); + + return true; +} + void RegBankLegalizeHelper::splitLoad(MachineInstr &MI, ArrayRef<LLT> LLTBreakdown, LLT MergeTy) { MachineFunction &MF = B.getMF(); @@ -391,7 +609,7 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, switch (Mapping.LoweringMethod) { case DoNotLower: - return; + break; case VccExtToSel: return lowerVccExtToSel(MI); case UniExtToSel: { @@ -527,7 +745,10 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, } } - // TODO: executeInWaterfallLoop(... 
WaterfallSgprs) + if (!WaterfallSgprs.empty()) { + MachineBasicBlock::iterator I = MI.getIterator(); + executeInWaterfallLoop(B, make_range(I, std::next(I)), WaterfallSgprs); + } } LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { @@ -539,6 +760,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { case Vgpr16: return LLT::scalar(16); case Sgpr32: + case Sgpr32_WF: case Sgpr32Trunc: case Sgpr32AExt: case Sgpr32AExtBoolInReg: @@ -577,6 +799,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { case VgprV2S32: return LLT::fixed_vector(2, 32); case SgprV4S32: + case SgprV4S32_WF: case VgprV4S32: case UniInVgprV4S32: return LLT::fixed_vector(4, 32); @@ -650,6 +873,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { return VccRB; case Sgpr16: case Sgpr32: + case Sgpr32_WF: case Sgpr64: case Sgpr128: case SgprP1: @@ -662,6 +886,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case SgprV2S16: case SgprV2S32: case SgprV4S32: + case SgprV4S32_WF: case SgprB32: case SgprB64: case SgprB96: @@ -923,6 +1148,14 @@ void RegBankLegalizeHelper::applyMappingSrc( } break; } + // sgpr waterfall, scalars and vectors + case Sgpr32_WF: + case SgprV4S32_WF: { + assert(Ty == getTyFromID(MethodIDs[i])); + if (RB != SgprRB) + SgprWaterfallOperandRegs.insert(Reg); + break; + } // sgpr and vgpr scalars with extend case Sgpr32AExt: { // Note: this ext allows S1, and it is meant to be combined away. diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h index 08cc7d4..db965d8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h @@ -32,6 +32,7 @@ class RegBankLegalizeHelper { const MachineUniformityInfo &MUI; const RegisterBankInfo &RBI; const RegBankLegalizeRules &RBLRules; + const bool IsWave32; const RegisterBank *SgprRB; const RegisterBank *VgprRB; const RegisterBank *VccRB; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index a60855c..5a6ad40 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -529,7 +529,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, addRulesForGOpcs({G_ICMP}) .Any({{UniS1, _, S32}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}}) - .Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}}); + .Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}}) + .Any({{DivS1, _, S64}, {{Vcc}, {None, Vgpr64, Vgpr64}}}); addRulesForGOpcs({G_FCMP}) .Any({{UniS1, _, S32}, {{UniInVcc}, {None, Vgpr32, Vgpr32}}}) @@ -666,11 +667,15 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, // clang-format off addRulesForGOpcs({G_LOAD}) .Any({{DivB32, DivP0}, {{VgprB32}, {VgprP0}}}) + .Any({{DivB32, UniP0}, {{VgprB32}, {VgprP0}}}) .Any({{DivB32, DivP1}, {{VgprB32}, {VgprP1}}}) .Any({{{UniB256, UniP1}, isAlign4 && isUL}, {{SgprB256}, {SgprP1}}}) .Any({{{UniB512, UniP1}, isAlign4 && isUL}, {{SgprB512}, {SgprP1}}}) .Any({{{UniB32, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB32}, {SgprP1}}}) + .Any({{{UniB64, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB64}, {SgprP1}}}) + .Any({{{UniB96, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB96}, {SgprP1}}}) + .Any({{{UniB128, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB128}, {SgprP1}}}) .Any({{{UniB256, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB256}, {VgprP1}, 
SplitLoad}}) .Any({{{UniB512, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB512}, {VgprP1}, SplitLoad}}) @@ -684,6 +689,7 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{{UniB96, UniP4}, isAlign16 && isUL}, {{SgprB96}, {SgprP4}, WidenLoad}}, !hasUnalignedLoads) .Any({{{UniB96, UniP4}, isAlign4 && !isAlign16 && isUL}, {{SgprB96}, {SgprP4}, SplitLoad}}, !hasUnalignedLoads) .Any({{{UniB96, UniP4}, isAlign4 && isUL}, {{SgprB96}, {SgprP4}}}, hasUnalignedLoads) + .Any({{{UniB128, UniP4}, isAlign4 && isUL}, {{SgprB128}, {SgprP4}}}) .Any({{{UniB256, UniP4}, isAlign4 && isUL}, {{SgprB256}, {SgprP4}}}) .Any({{{UniB512, UniP4}, isAlign4 && isUL}, {{SgprB512}, {SgprP4}}}) .Any({{{UniB32, UniP4}, !isNaturalAlignedSmall || !isUL}, {{UniInVgprB32}, {VgprP4}}}, hasSMRDSmall) // i8 and i16 load @@ -698,11 +704,15 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{{UniB32, UniP4}, !isAlign4 || !isUL}, {{UniInVgprB32}, {VgprP4}}}); // clang-format on - addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD}, Vector) - .Div(S32, {{Vgpr32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}}) - .Uni(S32, {{UniInVgprS32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}}) - .Div(V4S32, {{VgprV4S32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}}) - .Uni(V4S32, {{UniInVgprV4S32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}}); + addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD}, StandardB) + .Div(B32, {{VgprB32}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Uni(B32, {{UniInVgprB32}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Div(B64, {{VgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Uni(B64, {{UniInVgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Div(B96, {{VgprB96}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Uni(B96, {{UniInVgprB96}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Div(B128, {{VgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Uni(B128, {{UniInVgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}); addRulesForGOpcs({G_STORE}) .Any({{S32, P0}, {{}, {Vgpr32, VgprP0}}}) @@ -716,7 +726,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, addRulesForGOpcs({G_PTR_ADD}) .Any({{UniP1}, {{SgprP1}, {SgprP1, Sgpr64}}}) .Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}}) - .Any({{DivP0}, {{VgprP0}, {VgprP0, Vgpr64}}}); + .Any({{DivP0}, {{VgprP0}, {VgprP0, Vgpr64}}}) + .Any({{UniP4}, {{SgprP4}, {SgprP4, Sgpr64}}}); addRulesForGOpcs({G_INTTOPTR}) .Any({{UniPtr32}, {{SgprPtr32}, {Sgpr32}}}) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h index 7243d75..1391440 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h @@ -188,7 +188,11 @@ enum RegBankLLTMappingApplyID { Sgpr32Trunc, - // Src only modifiers: waterfalls, extends + // Src only modifiers: execute in waterfall loop if divergent + Sgpr32_WF, + SgprV4S32_WF, + + // Src only modifiers: extends Sgpr32AExt, Sgpr32AExtBoolInReg, Sgpr32SExt, diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index f1caf24..787db67 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -2528,7 +2528,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl( // Special case for s_mul_u64. There is not a vector equivalent of // s_mul_u64. Hence, we have to break down s_mul_u64 into 32-bit vector // multiplications. 
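The comment above notes that without a vector 64-bit multiply, s_mul_u64 has to be broken down into 32-bit multiplications. A standalone sketch of the identity that decomposition relies on (plain C++, names are illustrative, not LLVM APIs):

#include <cassert>
#include <cstdint>

static uint64_t mul64From32(uint32_t ALo, uint32_t AHi, uint32_t BLo, uint32_t BHi) {
  uint64_t LowProduct = (uint64_t)ALo * BLo;          // full 64-bit low product
  uint32_t ResultLo = (uint32_t)LowProduct;
  uint32_t ResultHi = (uint32_t)(LowProduct >> 32)    // carry out of the low product
                      + (uint32_t)((uint64_t)ALo * BHi)  // cross terms, low 32 bits only
                      + (uint32_t)((uint64_t)AHi * BLo); // AHi*BHi only affects bits >= 64
  return ((uint64_t)ResultHi << 32) | ResultLo;
}

int main() {
  uint64_t A = 0x123456789abcdef0ull, B = 0x0fedcba987654321ull;
  assert(mul64From32((uint32_t)A, (uint32_t)(A >> 32),
                     (uint32_t)B, (uint32_t)(B >> 32)) == A * B);
  return 0;
}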
- if (Opc == AMDGPU::G_MUL && DstTy.getSizeInBits() == 64) { + if (!Subtarget.hasVectorMulU64() && Opc == AMDGPU::G_MUL && + DstTy.getSizeInBits() == 64) { applyMappingSMULU64(B, OpdMapper); return; } @@ -3500,19 +3501,24 @@ void AMDGPURegisterBankInfo::applyMappingImpl( applyMappingMAD_64_32(B, OpdMapper); return; case AMDGPU::G_PREFETCH: { - if (!Subtarget.hasPrefetch() || !Subtarget.hasSafeSmemPrefetch()) { + if (!Subtarget.hasSafeSmemPrefetch() && !Subtarget.hasVmemPrefInsts()) { MI.eraseFromParent(); return; } Register PtrReg = MI.getOperand(0).getReg(); unsigned PtrBank = getRegBankID(PtrReg, MRI, AMDGPU::SGPRRegBankID); - if (PtrBank == AMDGPU::VGPRRegBankID) { + if (PtrBank == AMDGPU::VGPRRegBankID && + (!Subtarget.hasVmemPrefInsts() || !MI.getOperand(3).getImm())) { + // Cannot do I$ prefetch with divergent pointer. MI.eraseFromParent(); return; } unsigned AS = MRI.getType(PtrReg).getAddressSpace(); - if (!AMDGPU::isFlatGlobalAddrSpace(AS) && - AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) { + if ((!AMDGPU::isFlatGlobalAddrSpace(AS) && + AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) || + (!Subtarget.hasSafeSmemPrefetch() && + (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT || + !MI.getOperand(3).getImm() /* I$ prefetch */))) { MI.eraseFromParent(); return; } @@ -3973,7 +3979,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size); OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0]; } else { - OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size); + if (MI.getOpcode() == AMDGPU::G_MUL && Subtarget.hasVectorMulU64()) + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + else + OpdsMapping[0] = + getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size); unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/); OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size); @@ -5432,6 +5442,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); break; } + case Intrinsic::amdgcn_flat_prefetch: + case Intrinsic::amdgcn_global_prefetch: + return getDefaultMappingVOP(MI); default: return getInvalidInstructionMapping(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 1e44be8..6878744 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -61,6 +61,7 @@ protected: bool EnableRealTrue16Insts = false; bool HasBF16TransInsts = false; bool HasBF16ConversionInsts = false; + bool HasBF16PackedInsts = false; bool HasMadMixInsts = false; bool HasMadMacF32Insts = false; bool HasDsSrc2Insts = false; @@ -209,6 +210,8 @@ public: return HasBF16ConversionInsts; } + bool hasBF16PackedInsts() const { return HasBF16PackedInsts; } + bool hasMadMixInsts() const { return HasMadMixInsts; } diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index dc83230..421fc42 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -5139,13 +5139,45 @@ bool AMDGPUAsmParser::validateAGPRLdSt(const MCInst &Inst) const { bool AMDGPUAsmParser::validateVGPRAlign(const MCInst &Inst) const { auto FB = getFeatureBits(); + if (!FB[AMDGPU::FeatureGFX90AInsts] && !FB[AMDGPU::FeatureGFX1250Insts]) + return true; + unsigned Opc = Inst.getOpcode(); + const MCRegisterInfo *MRI = 
getMRI(); // DS_READ_B96_TR_B6 is the only DS instruction in GFX950, that allows // unaligned VGPR. All others only allow even aligned VGPRs. - if (!(FB[AMDGPU::FeatureGFX90AInsts]) || Opc == AMDGPU::DS_READ_B96_TR_B6_vi) + if (FB[AMDGPU::FeatureGFX90AInsts] && Opc == AMDGPU::DS_READ_B96_TR_B6_vi) return true; - const MCRegisterInfo *MRI = getMRI(); + if (FB[AMDGPU::FeatureGFX1250Insts]) { + switch (Opc) { + default: + break; + case AMDGPU::DS_LOAD_TR6_B96: + case AMDGPU::DS_LOAD_TR6_B96_gfx12: + // DS_LOAD_TR6_B96 is the only DS instruction in GFX1250, that + // allows unaligned VGPR. All others only allow even aligned VGPRs. + return true; + case AMDGPU::GLOBAL_LOAD_TR6_B96: + case AMDGPU::GLOBAL_LOAD_TR6_B96_gfx1250: { + // GLOBAL_LOAD_TR6_B96 is the only GLOBAL instruction in GFX1250, that + // allows unaligned VGPR for vdst, but other operands still only allow + // even aligned VGPRs. + int VAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr); + if (VAddrIdx != -1) { + const MCOperand &Op = Inst.getOperand(VAddrIdx); + MCRegister Sub = MRI->getSubReg(Op.getReg(), AMDGPU::sub0); + if ((Sub - AMDGPU::VGPR0) & 1) + return false; + } + return true; + } + case AMDGPU::GLOBAL_LOAD_TR6_B96_SADDR: + case AMDGPU::GLOBAL_LOAD_TR6_B96_SADDR_gfx1250: + return true; + } + } + const MCRegisterClass &VGPR32 = MRI->getRegClass(AMDGPU::VGPR_32RegClassID); const MCRegisterClass &AGPR32 = MRI->getRegClass(AMDGPU::AGPR_32RegClassID); for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { @@ -5292,6 +5324,12 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst, unsigned CPol = Inst.getOperand(CPolPos).getImm(); if (!isGFX1250()) { + if (CPol & CPol::SCAL) { + SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands); + StringRef CStr(S.getPointer()); + S = SMLoc::getFromPointer(&CStr.data()[CStr.find("scale_offset")]); + Error(S, "scale_offset is not supported on this GPU"); + } if (CPol & CPol::NV) { SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands); StringRef CStr(S.getPointer()); @@ -5300,6 +5338,13 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst, } } + if ((CPol & CPol::SCAL) && !supportsScaleOffset(MII, Inst.getOpcode())) { + SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands); + StringRef CStr(S.getPointer()); + S = SMLoc::getFromPointer(&CStr.data()[CStr.find("scale_offset")]); + Error(S, "scale_offset is not supported for this instruction"); + } + if (isGFX12Plus()) return validateTHAndScopeBits(Inst, Operands, CPol); @@ -6971,6 +7016,7 @@ ParseStatus AMDGPUAsmParser::parseCPol(OperandVector &Operands) { ParseStatus ResTH = ParseStatus::NoMatch; ParseStatus ResScope = ParseStatus::NoMatch; ParseStatus ResNV = ParseStatus::NoMatch; + ParseStatus ResScal = ParseStatus::NoMatch; for (;;) { if (ResTH.isNoMatch()) { @@ -7009,10 +7055,22 @@ ParseStatus AMDGPUAsmParser::parseCPol(OperandVector &Operands) { } } + if (ResScal.isNoMatch()) { + if (trySkipId("scale_offset")) { + ResScal = ParseStatus::Success; + CPolVal |= CPol::SCAL; + continue; + } else if (trySkipId("no", "scale_offset")) { + ResScal = ParseStatus::Success; + continue; + } + } + break; } - if (ResTH.isNoMatch() && ResScope.isNoMatch() && ResNV.isNoMatch()) + if (ResTH.isNoMatch() && ResScope.isNoMatch() && ResNV.isNoMatch() && + ResScal.isNoMatch()) return ParseStatus::NoMatch; Operands.push_back(AMDGPUOperand::CreateImm(this, CPolVal, StringLoc, diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index f7f29f1..5ccf1e5 100644 --- 
a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -14,7 +14,7 @@ let WantsRoot = true in { def GlobalSAddr : ComplexPattern<iPTR, 4, "SelectGlobalSAddr", [], [], -10>; def GlobalSAddrGLC : ComplexPattern<iPTR, 4, "SelectGlobalSAddrGLC", [], [], -10>; def ScratchSAddr : ComplexPattern<iPTR, 2, "SelectScratchSAddr", [], [], -10>; - def ScratchSVAddr : ComplexPattern<iPTR, 3, "SelectScratchSVAddr", [], [], -10>; + def ScratchSVAddr : ComplexPattern<iPTR, 4, "SelectScratchSVAddr", [], [], -10>; } class True16D16Table <string hiOp, string loOp> { @@ -464,6 +464,37 @@ class FLAT_Global_Invalidate_Writeback<string opName, SDPatternOperator node = n let sve = 0; } +class FLAT_Prefetch_Pseudo<string opName, dag addr = (ins VReg_64:$vaddr), string asm = " $vaddr"> : + FLAT_Pseudo<opName, (outs), !con(addr, (ins flat_offset:$offset, CPol_0:$cpol)), asm#"$offset$cpol"> { + let has_vdst = 0; + let has_data = 0; + let mayLoad = 1; + let mayStore = 1; + let VM_CNT = 0; + let LGKM_CNT = 0; +} + +multiclass FLAT_Flat_Prefetch_Pseudo<string opName> { + def "" : FLAT_Prefetch_Pseudo<opName>, + GlobalSaddrTable<0, opName>; + def _SADDR : FLAT_Prefetch_Pseudo<opName, (ins SReg_64:$saddr, VGPR_32:$vaddr), " $vaddr, $saddr">, + GlobalSaddrTable<1, opName> { + let OtherPredicates = [HasFlatGVSMode]; + let enabled_saddr = 1; + } +} + +multiclass FLAT_Global_Prefetch_Pseudo<string opName> { + let is_flat_global = 1, has_saddr = 1 in { + def "" : FLAT_Prefetch_Pseudo<opName, (ins VReg_64:$vaddr), " $vaddr, off">, + GlobalSaddrTable<0, opName>; + def _SADDR : FLAT_Prefetch_Pseudo<opName, (ins SReg_64:$saddr, VGPR_32:$vaddr), " $vaddr, $saddr">, + GlobalSaddrTable<1, opName> { + let enabled_saddr = 1; + } + } +} + class FlatScratchInst <string sv_op, string mode> { string SVOp = sv_op; string Mode = mode; @@ -1162,6 +1193,16 @@ defm SCRATCH_LOAD_LDS_USHORT : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_u defm SCRATCH_LOAD_LDS_SSHORT : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_sshort">; defm SCRATCH_LOAD_LDS_DWORD : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_dword">; +let SubtargetPredicate = isGFX125xOnly in { +defm FLAT_LOAD_MONITOR_B32 : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b32", VGPR_32>; +defm FLAT_LOAD_MONITOR_B64 : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b64", VReg_64>; +defm FLAT_LOAD_MONITOR_B128 : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b128", VReg_128>; + +defm GLOBAL_LOAD_MONITOR_B32 : FLAT_Global_Load_Pseudo <"global_load_monitor_b32", VGPR_32>; +defm GLOBAL_LOAD_MONITOR_B64 : FLAT_Global_Load_Pseudo <"global_load_monitor_b64", VReg_64>; +defm GLOBAL_LOAD_MONITOR_B128 : FLAT_Global_Load_Pseudo <"global_load_monitor_b128", VReg_128>; +} // End SubtargetPredicate = isGFX125xOnly + let SubtargetPredicate = isGFX12Plus in { let Uses = [EXEC, M0] in { defm GLOBAL_LOAD_BLOCK : FLAT_Global_Load_Pseudo <"global_load_block", VReg_1024>; @@ -1218,6 +1259,11 @@ let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in "global_atomic_pk_add_f16", VGPR_32, v2f16 >; +let SubtargetPredicate = HasVmemPrefInsts in { + defm FLAT_PREFETCH_B8 : FLAT_Flat_Prefetch_Pseudo<"flat_prefetch_b8">; + defm GLOBAL_PREFETCH_B8 : FLAT_Global_Prefetch_Pseudo<"global_prefetch_b8">; +} + //===----------------------------------------------------------------------===// // Flat Patterns //===----------------------------------------------------------------------===// @@ -1443,19 +1489,19 @@ class ScratchStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, >; class 
ScratchLoadSVaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset))), - (inst $vaddr, $saddr, $offset, 0) + (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol))), + (inst $vaddr, $saddr, $offset, $cpol) >; class ScratchStoreSVaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (node vt:$data, (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset)), - (inst getVregSrcForVT<vt>.ret:$data, $vaddr, $saddr, $offset) + (node vt:$data, (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol)), + (inst getVregSrcForVT<vt>.ret:$data, $vaddr, $saddr, $offset, $cpol) >; class ScratchLoadSVaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset), vt:$in)), - (inst $vaddr, $saddr, $offset, 0, $in) + (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol), vt:$in)), + (inst $vaddr, $saddr, $offset, $cpol, $in) >; class ScratchLoadSVaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < @@ -2138,6 +2184,77 @@ defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16, load_d16_lo_private, v2f } // End OtherPredicates = [HasFlatScratchInsts,EnableFlatScratch] +def PrefetchLoc: SDNodeXForm<timm, [{ + uint32_t V = N->getZExtValue(); + V = (AMDGPU::CPol::SCOPE_MASK - (V & AMDGPU::CPol::SCOPE_MASK)) << AMDGPU::CPol::SCOPE_SHIFT; + if (!Subtarget->hasSafeCUPrefetch()) + V = std::max(V, (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe + return CurDAG->getTargetConstant(V, SDLoc(N), MVT::i32); +}]>; + +def prefetch_flat : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type), + (prefetch node:$ptr, node:$rw, node:$loc, node:$type), + [{ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS; }]> { + let GISelPredicateCode = [{ + return (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS; + }]; +} + +def prefetch_global : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type), + (prefetch node:$ptr, node:$rw, node:$loc, node:$type), + [{ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || + (cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && + !Subtarget->hasSafeSmemPrefetch()); }]> { + let GISelPredicateCode = [{ + return (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::GLOBAL_ADDRESS || + ((*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS && + !Subtarget->hasSafeSmemPrefetch()); + }]; +} + +multiclass FlatPrefetchPats<string inst, SDPatternOperator prefetch_kind, SDPatternOperator rw> { + def : GCNPat < + (prefetch_kind (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), rw, (i32 timm:$loc), i32imm_one), + (!cast<FLAT_Pseudo>(inst) $vaddr, $offset, (i32 (PrefetchLoc $loc))) + > { + let AddedComplexity = !if(!eq(rw, i32imm_zero), 0, 25); + } + + def : GCNPat < + (prefetch_kind (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), rw, (i32 timm:$loc), i32imm_one), + (!cast<FLAT_Pseudo>(inst#"_SADDR") $saddr, $voffset, $offset, (i32 (PrefetchLoc $loc))) + > { + let AddedComplexity = !if(!eq(rw, i32imm_zero), 11, 30); + } +} + +multiclass FlatIntrPrefetchPats<string inst, SDPatternOperator intr> { + def : GCNPat < + (intr (FlatOffset i64:$vaddr, i32:$offset), timm:$cpol), + (!cast<FLAT_Pseudo>(inst) 
$vaddr, $offset, $cpol) + >; + + def : GCNPat < + (intr (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), timm:$cpol), + (!cast<FLAT_Pseudo>(inst#"_SADDR") $saddr, $voffset, $offset, $cpol)> { + let AddedComplexity = 11; + } +} + +let SubtargetPredicate = HasVmemPrefInsts in { + defm : FlatPrefetchPats<"FLAT_PREFETCH_B8", prefetch_flat, i32imm_zero>; + defm : FlatPrefetchPats<"GLOBAL_PREFETCH_B8", prefetch_global, i32imm_zero>; + + // Patterns for forced vector prefetch with rw = 1. + defm : FlatPrefetchPats<"FLAT_PREFETCH_B8", prefetch_flat, i32imm_one>; + defm : FlatPrefetchPats<"GLOBAL_PREFETCH_B8", prefetch_global, i32imm_one>; + + + // Patterns for target intrinsics + defm : FlatIntrPrefetchPats<"FLAT_PREFETCH_B8", int_amdgcn_flat_prefetch>; + defm : FlatIntrPrefetchPats<"GLOBAL_PREFETCH_B8", int_amdgcn_global_prefetch>; +} // End SubtargetPredicate = HasVmemPrefInsts + //===----------------------------------------------------------------------===// // Target //===----------------------------------------------------------------------===// @@ -2941,6 +3058,7 @@ multiclass VFLAT_Real_gfx12 <bits<8> op, string name = get_FLAT_ps<NAME>.Mnemoni let DecoderNamespace = "GFX12"; let Inst{25-24} = {ps.is_flat_global, ps.is_flat_scratch}; + let Inst{48} = cpol{CPolBit.SCAL}; // scale offset } } @@ -3170,6 +3288,7 @@ multiclass VFLAT_Real_gfx1250<bits<8> op, let DecoderNamespace = "GFX1250"; let Inst{25-24} = {ps.is_flat_global, ps.is_flat_scratch}; + let Inst{48} = cpol{CPolBit.SCAL}; // scale offset } } @@ -3208,6 +3327,17 @@ multiclass VFLAT_Real_Atomics_gfx1250<bits<8> op, string name = get_FLAT_ps<NAME defm TENSOR_SAVE : VFLAT_Real_gfx1250<0x06e>; defm TENSOR_STOP : VFLAT_Real_gfx1250<0x06f>; +defm FLAT_PREFETCH_B8 : VFLAT_Real_AllAddr_gfx1250<0x05d>; +defm GLOBAL_PREFETCH_B8 : VFLAT_Real_AllAddr_gfx1250<0x05d>; + +defm FLAT_LOAD_MONITOR_B32 : VFLAT_Real_AllAddr_gfx1250<0x070>; +defm FLAT_LOAD_MONITOR_B64 : VFLAT_Real_AllAddr_gfx1250<0x071>; +defm FLAT_LOAD_MONITOR_B128 : VFLAT_Real_AllAddr_gfx1250<0x072>; + +defm GLOBAL_LOAD_MONITOR_B32 : VFLAT_Real_AllAddr_gfx1250<0x070>; +defm GLOBAL_LOAD_MONITOR_B64 : VFLAT_Real_AllAddr_gfx1250<0x071>; +defm GLOBAL_LOAD_MONITOR_B128 : VFLAT_Real_AllAddr_gfx1250<0x072>; + defm GLOBAL_LOAD_TR_B128_w32 : VFLAT_Real_AllAddr_gfx1250<0x057, "global_load_tr16_b128">; defm GLOBAL_LOAD_TR_B64_w32 : VFLAT_Real_AllAddr_gfx1250<0x058, "global_load_tr8_b64">; diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index c4a3be4..94886b0 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -520,8 +520,8 @@ static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, const MachineInstr *MI, IsExpiredFn IsExpired) { DenseSet<const MachineBasicBlock *> Visited; return getWaitStatesSince(IsHazard, MI->getParent(), - std::next(MI->getReverseIterator()), - 0, IsExpired, Visited); + std::next(MI->getReverseIterator()), 0, IsExpired, + Visited, SIInstrInfo::getNumWaitStates); } int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) { @@ -1190,7 +1190,8 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) { fixVALUPartialForwardingHazard(MI); fixVALUTransUseHazard(MI); fixVALUTransCoexecutionHazards(MI); - fixWMMAHazards(MI); + fixWMMAHazards(MI); // fall-through if co-execution is enabled. 
+ fixWMMACoexecutionHazards(MI); fixShift64HighRegBug(MI); fixVALUMaskWriteHazard(MI); fixRequiredExportPriority(MI); } @@ -1909,6 +1910,182 @@ bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) { return true; } +static bool isCoexecutableVALUInst(const MachineInstr &MI) { + return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isTRANS(MI) && + !SIInstrInfo::isWMMA(MI) && !SIInstrInfo::isSWMMAC(MI); // What else? +} + +static bool IsWMMAHazardInstInCategory(const MachineInstr &MI, + const SIInstrInfo *TII, unsigned Latency, + unsigned Category) { + assert(TII->isXDLWMMA(MI) && (Latency == 8 || Latency == 16) && + "Handle me if the xdl wmma instruction latency changes"); + + switch (Category) { + case 0: // Dense WMMA Instructions: + // WMMA_*F16, WMMA_*BF16 + // WMMA_*FP8FP8 + // WMMA_*FP8BF8 + // WMMA_*BF8FP8 + // WMMA_*BF8BF8 + // WMMA_*F8F6F4 if SRCA & SRCB != F8 + return Latency == 8 && SIInstrInfo::isWMMA(MI); + + case 1: // Dense WMMA Instructions: + // WMMA_IU8 + // WMMA_IU4 + // WMMA_*F8F6F4 if SRCA OR SRCB == F8 + return Latency == 16 && SIInstrInfo::isWMMA(MI); + + case 2: // Dense SWMMAC Instructions + // SWMMAC_*F16, SWMMAC_*BF16, + // SWMMAC_*FP8FP8 + // SWMMAC_*BF8FP8 + // SWMMAC_*FP8BF8 + // SWMMAC_*BF8BF8 + return Latency == 8 && SIInstrInfo::isSWMMAC(MI); + + case 3: // Sparse WMMA Instructions: + // SWMMAC_IU8 + // SWMMAC_IU4 + return Latency == 16 && SIInstrInfo::isSWMMAC(MI); + default: + break; + } // end switch. + + return false; +} + +bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) { + if (!AMDGPU::isGFX1250(ST)) + return false; + + const SIInstrInfo *TII = ST.getInstrInfo(); + if (!TII->isXDLWMMA(*MI) && !isCoexecutableVALUInst(*MI)) + return false; + + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + + // WaitStates here is the number of V_NOPs or unrelated VALU instructions that must + // be in between the first WMMA and the second instruction to cover the hazard + // (WMMAWaitStates if the second is also a WMMA, VALUWaitStates if the second + // is a VALU). Refer to SPG 4.6.12.1. "Requirements for WMMA data hazards" for + // numbers, which depend on the category of the first WMMA. + const int WMMAWaitStates[] = {5, 9, 3, 5}; + const int VALUWaitStates[] = {4, 8, 2, 4}; + unsigned Category = 0; + + auto IsWMMAHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) { + if (!TII->isXDLWMMA(I)) + return false; + + unsigned Latency = TSchedModel.computeInstrLatency(&I); + if (!IsWMMAHazardInstInCategory(I, TII, Latency, Category)) + return false; + + Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg(); + Register A1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg(); + Register B1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg(); + + // WMMA0 writes (D0), WMMA1 reads (A1/B1/Idx1). + if (TRI->regsOverlap(D0, A1) || TRI->regsOverlap(D0, B1)) + return true; + + if (SIInstrInfo::isSWMMAC(*MI)) { + Register Idx1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg(); + if (TRI->regsOverlap(D0, Idx1)) + return true; + } + + return false; + }; + + auto IsVALUHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) { + if (!TII->isXDLWMMA(I)) + return false; + + unsigned Latency = TSchedModel.computeInstrLatency(&I); + if (!IsWMMAHazardInstInCategory(I, TII, Latency, Category)) + return false; + + // WMMA writes, VALU reads.
+ Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg(); + for (const MachineOperand &ValuUse : MI->explicit_uses()) { + if (ValuUse.isReg() && TRI->regsOverlap(D0, ValuUse.getReg())) + return true; + } + + auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst); + if (!ValuDst || !ValuDst->isReg()) + return false; + Register D1 = ValuDst->getReg(); + + // WMMA writes, VALU writes. + if (TRI->regsOverlap(D0, D1)) + return true; + + // WMMA reads, VALU writes. + Register A0 = TII->getNamedOperand(I, AMDGPU::OpName::src0)->getReg(); + Register B0 = TII->getNamedOperand(I, AMDGPU::OpName::src1)->getReg(); + if (TRI->regsOverlap(A0, D1) || TRI->regsOverlap(B0, D1)) + return true; + + if (SIInstrInfo::isSWMMAC(I)) { + Register Idx0 = TII->getNamedOperand(I, AMDGPU::OpName::src2)->getReg(); + if (TRI->regsOverlap(D1, Idx0)) + return true; + } + + return false; + }; + + int Limit = 0; + auto IsExpiredFn = [&Limit](const MachineInstr &, int WaitStates) { + return WaitStates >= Limit; + }; + + auto GetWaitStatesFn = [](const MachineInstr &I) { + return SIInstrInfo::isVALU(I) ? 1 : 0; + }; + + int WaitStatesNeeded = -1; + if (TII->isXDLWMMA(*MI)) { + for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) { + Limit = WMMAWaitStates[Category]; // for IsExpiredFn. + DenseSet<const MachineBasicBlock *> Visited; + // '::getWaitStatesSince' returns the number of VALUs in between if hazard + // exists, and INT_MAX if there is no hazard. As a result, a negative + // WaitStatesNeeded here means no hazard, and we will continue to search + // for other categories. + WaitStatesNeeded = + Limit - ::getWaitStatesSince(IsWMMAHazardFn, MI->getParent(), + std::next(MI->getReverseIterator()), 0, + IsExpiredFn, Visited, GetWaitStatesFn); + } + } else { // Must be a co-executable VALU. + for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) { + Limit = VALUWaitStates[Category]; // for IsExpiredFn. + DenseSet<const MachineBasicBlock *> Visited; + // '::getWaitStatesSince' returns the number of VALUs in between if hazard + // exists, and INT_MAX if there is no hazard. As a result, a negative + // WaitStatesNeeded here means no hazard, and we will continue to search + // for other categories. + WaitStatesNeeded = + Limit - ::getWaitStatesSince(IsVALUHazardFn, MI->getParent(), + std::next(MI->getReverseIterator()), 0, + IsExpiredFn, Visited, GetWaitStatesFn); + } + } + + // WaitStatesNeeded now is the number of V_NOPs we need to insert, negative + // means not needed. 
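// Worked example of the arithmetic above (illustrative numbers only): if MI is
// itself an XDL WMMA and the producer found by ::getWaitStatesSince is a
// Category 1 WMMA (IU8/IU4, latency 16), then Limit = WMMAWaitStates[1] = 9.
// With three VALU instructions already between the producer and MI, the search
// returns 3 and WaitStatesNeeded = 9 - 3 = 6, so the loop below emits six
// V_NOPs. If no producer in the current category is found before the limit
// expires, the search returns INT_MAX, WaitStatesNeeded stays negative, and
// the category loop above simply moves on to the next category.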
+ for (int i = 0; i < WaitStatesNeeded; i++) + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII->get(AMDGPU::V_NOP_e32)); + + return true; +} + bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) { if (!ST.hasShift64HighRegBug()) return false; diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h index ef6ddd8..f796eeae 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -106,6 +106,7 @@ private: bool fixVALUTransUseHazard(MachineInstr *MI); bool fixVALUTransCoexecutionHazards(MachineInstr *MI); bool fixWMMAHazards(MachineInstr *MI); + bool fixWMMACoexecutionHazards(MachineInstr *MI); bool fixShift64HighRegBug(MachineInstr *MI); bool fixVALUMaskWriteHazard(MachineInstr *MI); bool fixRequiredExportPriority(MachineInstr *MI); diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp index 7b8f0f4..9a2bab1 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -324,7 +324,7 @@ bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const { } void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, - unsigned NumRegionInstrs) const { + const SchedRegion &Region) const { // Track register pressure so the scheduler can try to decrease // pressure once register usage is above the threshold defined by // SIRegisterInfo::getRegPressureSetLimit() diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 268162b..88a269f 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -123,6 +123,7 @@ protected: bool HasSMemRealTime = false; bool HasIntClamp = false; bool HasFmaMixInsts = false; + bool HasFmaMixBF16Insts = false; bool HasMovrel = false; bool HasVGPRIndexMode = false; bool HasScalarDwordx3Loads = false; @@ -244,7 +245,9 @@ protected: bool HasVMEMtoScalarWriteHazard = false; bool HasSMEMtoVectorWriteHazard = false; bool HasInstFwdPrefetchBug = false; + bool HasVmemPrefInsts = false; bool HasSafeSmemPrefetch = false; + bool HasSafeCUPrefetch = false; bool HasVcmpxExecWARHazard = false; bool HasLdsBranchVmemWARHazard = false; bool HasNSAtoVMEMBug = false; @@ -265,8 +268,10 @@ protected: bool HasIEEEMinimumMaximumInsts = false; bool HasMinimum3Maximum3F32 = false; bool HasMinimum3Maximum3F16 = false; + bool HasMin3Max3PKF16 = false; bool HasMinimum3Maximum3PKF16 = false; bool HasLshlAddU64Inst = false; + bool HasAddSubU64Insts = false; bool HasPointSampleAccel = false; bool HasLdsBarrierArriveAtomic = false; bool HasSetPrioIncWgInst = false; @@ -460,6 +465,8 @@ public: return HasFmaMixInsts; } + bool hasFmaMixBF16Insts() const { return HasFmaMixBF16Insts; } + bool hasCARRY() const { return true; } @@ -985,8 +992,12 @@ public: bool hasPrefetch() const { return GFX12Insts; } + bool hasVmemPrefInsts() const { return HasVmemPrefInsts; } + bool hasSafeSmemPrefetch() const { return HasSafeSmemPrefetch; } + bool hasSafeCUPrefetch() const { return HasSafeCUPrefetch; } + // Has s_cmpk_* instructions. 
bool hasSCmpK() const { return getGeneration() < GFX12; } @@ -1022,7 +1033,7 @@ public: } void overrideSchedPolicy(MachineSchedPolicy &Policy, - unsigned NumRegionInstrs) const override; + const SchedRegion &Region) const override; void mirFileLoaded(MachineFunction &MF) const override; @@ -1162,8 +1173,14 @@ public: bool hasLshlAddU64Inst() const { return HasLshlAddU64Inst; } + // Scalar and global loads support scale_offset bit. + bool hasScaleOffset() const { return GFX1250Insts; } + bool hasFlatGVSMode() const { return FlatGVSMode; } + // FLAT GLOBAL VOffset is signed + bool hasSignedGVSOffset() const { return GFX1250Insts; } + bool enableSIScheduler() const { return EnableSIScheduler; } @@ -1300,7 +1317,7 @@ public: bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; } - bool hasVALUReadSGPRHazard() const { return getGeneration() == GFX12; } + bool hasVALUReadSGPRHazard() const { return GFX12Insts && !GFX1250Insts; } /// Return if operations acting on VGPR tuples require even alignment. bool needsAlignedVGPRs() const { return GFX90AInsts || GFX1250Insts; } @@ -1381,6 +1398,8 @@ public: return HasMinimum3Maximum3F16; } + bool hasMin3Max3PKF16() const { return HasMin3Max3PKF16; } + bool hasTanhInsts() const { return HasTanhInsts; } bool hasAddPC64Inst() const { return GFX1250Insts; } @@ -1494,6 +1513,18 @@ public: bool hasVOPD3() const { return GFX1250Insts; } + // \returns true if the target has V_ADD_U64/V_SUB_U64 instructions. + bool hasAddSubU64Insts() const { return HasAddSubU64Insts; } + + // \returns true if the target has V_MUL_U64/V_MUL_I64 instructions. + bool hasVectorMulU64() const { return GFX1250Insts; } + + // \returns true if the target has V_PK_ADD_{MIN|MAX}_{I|U}16 instructions. + bool hasPkAddMinMaxInsts() const { return GFX1250Insts; } + + // \returns true if the target has V_PK_{MIN|MAX}3_{I|U}16 instructions. + bool hasPkMinMax3Insts() const { return GFX1250Insts; } + // \returns true if target has S_SETPRIO_INC_WG instruction. 
bool hasSetPrioIncWgInst() const { return HasSetPrioIncWgInst; } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index 197bb3f..11b072e 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -157,6 +157,9 @@ void AMDGPUInstPrinter::printCPol(const MCInst *MI, unsigned OpNo, const int64_t TH = Imm & CPol::TH; const int64_t Scope = Imm & CPol::SCOPE; + if (Imm & CPol::SCAL) + O << " scale_offset"; + printTH(MI, TH, Scope, O); printScope(Scope, O); diff --git a/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp index 429ce0e0..a33dbfa 100644 --- a/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp +++ b/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp @@ -270,5 +270,6 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { MI.eraseFromParent(); } } + finalizeBundles(MF); return false; } diff --git a/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp b/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp index 2a3b42e..eff5b0a 100644 --- a/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp @@ -138,7 +138,6 @@ void R600PassConfig::addPreSched2() { void R600PassConfig::addPreEmitPass() { addPass(createR600MachineCFGStructurizerPass()); addPass(createR600ExpandSpecialInstrsPass()); - addPass(&FinalizeMachineBundlesID); addPass(createR600Packetizer()); addPass(createR600ControlFlowFinalizer()); } diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index d379088..40b8bcd 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -392,16 +392,20 @@ enum CPol { TH_ATOMIC_CASCADE = 4, // Cascading vs regular // Scope - SCOPE = 0x3 << 3, // All Scope bits - SCOPE_CU = 0 << 3, - SCOPE_SE = 1 << 3, - SCOPE_DEV = 2 << 3, - SCOPE_SYS = 3 << 3, + SCOPE_SHIFT = 3, + SCOPE_MASK = 0x3, + SCOPE = SCOPE_MASK << SCOPE_SHIFT, // All Scope bits + SCOPE_CU = 0 << SCOPE_SHIFT, + SCOPE_SE = 1 << SCOPE_SHIFT, + SCOPE_DEV = 2 << SCOPE_SHIFT, + SCOPE_SYS = 3 << SCOPE_SHIFT, NV = 1 << 5, // Non-volatile bit SWZ = 1 << 6, // Swizzle bit + SCAL = 1 << 11, // Scale offset bit + ALL = TH | SCOPE, // Helper bits diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index e172c0b..e5d1eaa 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1209,18 +1209,24 @@ void SIFoldOperandsImpl::foldOperand( return; } - // A frame index will resolve to a positive constant, so it should always be - // safe to fold the addressing mode, even pre-GFX9. - UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getFI()); - const unsigned Opc = UseMI->getOpcode(); if (TII->isFLATScratch(*UseMI) && AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) && !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::saddr)) { unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(Opc); + unsigned CPol = + TII->getNamedOperand(*UseMI, AMDGPU::OpName::cpol)->getImm(); + if ((CPol & AMDGPU::CPol::SCAL) && + !AMDGPU::supportsScaleOffset(*TII, NewOpc)) + return; + UseMI->setDesc(TII->get(NewOpc)); } + // A frame index will resolve to a positive constant, so it should always be + // safe to fold the addressing mode, even pre-GFX9. 
+ UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getFI()); + return; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index bc0fd8d..74fe2b8 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -874,13 +874,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom); - if (Subtarget->hasScalarSMulU64()) + if (Subtarget->hasVectorMulU64()) + setOperationAction(ISD::MUL, MVT::i64, Legal); + else if (Subtarget->hasScalarSMulU64()) setOperationAction(ISD::MUL, MVT::i64, Custom); if (Subtarget->hasMad64_32()) setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom); - if (Subtarget->hasPrefetch() && Subtarget->hasSafeSmemPrefetch()) + if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts()) setOperationAction(ISD::PREFETCH, MVT::Other, Custom); if (Subtarget->hasIEEEMinimumMaximumInsts()) { @@ -944,6 +946,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Legal); } + if (Subtarget->hasBF16PackedInsts()) { + setOperationAction( + {ISD::FADD, ISD::FMUL, ISD::FMINNUM, ISD::FMAXNUM, ISD::FMA}, + MVT::v2bf16, Legal); + } + if (Subtarget->hasBF16TransInsts()) { setOperationAction({ISD::FEXP2, ISD::FLOG2, ISD::FSQRT}, MVT::bf16, Legal); } @@ -1053,10 +1061,12 @@ ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const { // where this is OK to use. bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const { - return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) || - (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) && - DestVT.getScalarType() == MVT::f32 && - SrcVT.getScalarType() == MVT::f16 && + return DestVT.getScalarType() == MVT::f32 && + ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) || + (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) && + SrcVT.getScalarType() == MVT::f16) || + (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() && + SrcVT.getScalarType() == MVT::bf16)) && // TODO: This probably only requires no input flushing? denormalModeIsFlushAllF32(DAG.getMachineFunction()); } @@ -1540,7 +1550,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; return true; } - case Intrinsic::amdgcn_s_prefetch_data: { + case Intrinsic::amdgcn_s_prefetch_data: + case Intrinsic::amdgcn_flat_prefetch: + case Intrinsic::amdgcn_global_prefetch: { Info.opc = ISD::INTRINSIC_VOID; Info.memVT = EVT::getIntegerVT(CI.getContext(), 8); Info.ptrVal = CI.getArgOperand(0); @@ -4432,19 +4444,28 @@ SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op, } SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const { - if (Op->isDivergent()) + if (Op->isDivergent() && + (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4))) + // Cannot do I$ prefetch with divergent pointer. 
return SDValue(); switch (cast<MemSDNode>(Op)->getAddressSpace()) { case AMDGPUAS::FLAT_ADDRESS: case AMDGPUAS::GLOBAL_ADDRESS: case AMDGPUAS::CONSTANT_ADDRESS: - case AMDGPUAS::CONSTANT_ADDRESS_32BIT: break; + case AMDGPUAS::CONSTANT_ADDRESS_32BIT: + if (Subtarget->hasSafeSmemPrefetch()) + break; + [[fallthrough]]; default: return SDValue(); } + // I$ prefetch + if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4)) + return SDValue(); + return Op; } @@ -5415,6 +5436,19 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineOperand &Src0 = MI.getOperand(1); MachineOperand &Src1 = MI.getOperand(2); + if (ST.hasAddSubU64Insts()) { + auto I = BuildMI(*BB, MI, DL, + TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64 + : AMDGPU::V_SUB_U64_e64), + Dest.getReg()) + .add(Src0) + .add(Src1) + .addImm(0); // clamp + TII->legalizeOperands(*I); + MI.eraseFromParent(); + return BB; + } + if (IsAdd && ST.hasLshlAddU64Inst()) { auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64), Dest.getReg()) @@ -13633,6 +13667,7 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op, case Intrinsic::amdgcn_rcp_legacy: case Intrinsic::amdgcn_rsq_legacy: case Intrinsic::amdgcn_trig_preop: + case Intrinsic::amdgcn_tanh: case Intrinsic::amdgcn_log: case Intrinsic::amdgcn_exp2: case Intrinsic::amdgcn_sqrt: @@ -14046,7 +14081,8 @@ static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, case ISD::FMAXIMUMNUM: case AMDGPUISD::FMIN_LEGACY: case AMDGPUISD::FMAX_LEGACY: - return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()); + return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) || + (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16()); case ISD::FMINIMUM: case ISD::FMAXIMUM: return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) || diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 9faf497..dd3f2fe 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -2108,8 +2108,9 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const { assert(TII->isFLAT(MI)); - // All flat instructions use the VMEM counter. - assert(TII->usesVM_CNT(MI)); + // All flat instructions use the VMEM counter except prefetch. + if (!TII->usesVM_CNT(MI)) + return false; // If there are no memory operands then conservatively assume the flat // operation may access VMEM. @@ -2295,9 +2296,6 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); } - // A Flat memory operation must access at least one address space. - assert(FlatASCount); - // This is a flat memory operation that access both VMEM and LDS, so note it // - it will require that both the VM and LGKM be flushed to zero if it is // pending when a VM or LGKM dependency occurs. 
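Taken together, the prefetch changes above (lowerPREFETCH in SIISelLowering.cpp and the PrefetchLoc transform plus prefetch patterns in FLATInstructions.td) decide whether an ISD::PREFETCH survives lowering and which cache scope it is given. Below is a minimal standalone sketch of that decision, with the subtarget queries and prefetch operands passed in as plain parameters; keepPrefetch, prefetchLocToScope and their arguments are illustrative names, not in-tree helpers.

#include <algorithm>
#include <cstdint>

enum class AS { Flat, Global, Constant, Constant32, Other };

namespace CPol {
constexpr uint32_t SCOPE_SHIFT = 3; // values from SIDefines.h above
constexpr uint32_t SCOPE_MASK = 0x3;
constexpr uint32_t SCOPE_SE = 1u << SCOPE_SHIFT;
} // namespace CPol

// Mirrors lowerPREFETCH: returns whether the prefetch is kept. IsData mirrors
// prefetch operand 4 (1 = data cache, 0 = instruction cache).
bool keepPrefetch(bool PtrIsDivergent, bool IsData, AS AddrSpace,
                  bool HasVmemPrefInsts, bool HasSafeSmemPrefetch) {
  // A divergent pointer needs the per-lane VMEM prefetch instructions, and
  // those only prefetch the data cache.
  if (PtrIsDivergent && (!HasVmemPrefInsts || !IsData))
    return false;

  switch (AddrSpace) {
  case AS::Flat:
  case AS::Global:
  case AS::Constant:
    break;
  case AS::Constant32:
    if (!HasSafeSmemPrefetch)
      return false;
    break;
  default:
    return false;
  }

  // Instruction-cache prefetch is only reachable through the scalar (SMEM)
  // prefetch path.
  return HasSafeSmemPrefetch || IsData;
}

// Mirrors the PrefetchLoc SDNodeXForm: IR locality 3 (keep closest) becomes
// the narrowest scope (CU = 0), locality 0 the widest (SYS = 3); when CU
// prefetch is unsafe the result is widened to at least SE scope.
uint32_t prefetchLocToScope(uint32_t Locality, bool HasSafeCUPrefetch) {
  uint32_t V = (CPol::SCOPE_MASK - (Locality & CPol::SCOPE_MASK))
               << CPol::SCOPE_SHIFT;
  if (!HasSafeCUPrefetch)
    V = std::max(V, CPol::SCOPE_SE); // CU scope is unsafe
  return V;
}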
diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td index 6b41934..89d9b0d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td +++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td @@ -318,6 +318,7 @@ def CPolBit { int DLC = 2; int SCC = 4; int NV = 5; + int SCAL = 11; } class VOPDstOperand <RegisterClass rc> : RegisterOperand <rc, "printVOPDst">; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index e2a2525..40e6871 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -5482,6 +5482,19 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } + if (const MachineOperand *CPol = getNamedOperand(MI, AMDGPU::OpName::cpol)) { + if (CPol->getImm() & AMDGPU::CPol::SCAL) { + if (!ST.hasScaleOffset()) { + ErrInfo = "Subtarget does not support offset scaling"; + return false; + } + if (!AMDGPU::supportsScaleOffset(*this, MI.getOpcode())) { + ErrInfo = "Instruction does not support offset scaling"; + return false; + } + } + } + return true; } @@ -7348,6 +7361,10 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, } case AMDGPU::S_MUL_U64: + if (ST.hasVectorMulU64()) { + NewOpcode = AMDGPU::V_MUL_U64_e64; + break; + } // Split s_mul_u64 in 32-bit vector multiplications. splitScalarSMulU64(Worklist, Inst, MDT); Inst.eraseFromParent(); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index bd4995b..b0be3f86 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1662,6 +1662,8 @@ def VOP3OpSelMods : ComplexPattern<untyped, 2, "SelectVOP3OpSelMods">; def VOP3PMadMixModsExt : ComplexPattern<untyped, 2, "SelectVOP3PMadMixModsExt">; def VOP3PMadMixMods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixMods">; +def VOP3PMadMixBF16ModsExt : ComplexPattern<untyped, 2, "SelectVOP3PMadMixBF16ModsExt">; +def VOP3PMadMixBF16Mods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixBF16Mods">; def VINTERPMods : ComplexPattern<untyped, 2, "SelectVINTERPMods">; def VINTERPModsHi : ComplexPattern<untyped, 2, "SelectVINTERPModsHi">; @@ -2866,6 +2868,7 @@ def VOP_I16_I16_I16_ARITH : VOPProfile <[i16, i16, i16, untyped], /*EnableClamp= def VOP_I16_I16_I16_I16 : VOPProfile <[i16, i16, i16, i16, untyped]>; def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>; +def VOP_BF16_BF16_BF16_BF16 : VOPProfile <[bf16, bf16, bf16, bf16, untyped]>; def VOP_I32_I16_I16_I32 : VOPProfile <[i32, i16, i16, i32, untyped]>; def VOP_I32_I16 : VOPProfile <[i32, i16, untyped, untyped]>; @@ -2873,10 +2876,12 @@ def VOP_I16_I32 : VOPProfile <[i16, i32, untyped, untyped]>; def VOP_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, untyped]>; def VOP_V2I16_V2I16_V2I16 : VOPProfile <[v2i16, v2i16, v2i16, untyped]>; +def VOP_V2BF16_V2BF16_V2BF16 : VOPProfile <[v2bf16, v2bf16, v2bf16, untyped]>; def VOP_B32_F16_F16 : VOPProfile <[i32, f16, f16, untyped]>; def VOP_V2F16_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, v2f16]>; def VOP_V2I16_V2I16_V2I16_V2I16 : VOPProfile <[v2i16, v2i16, v2i16, v2i16]>; +def VOP_V2BF16_V2BF16_V2BF16_V2BF16 : VOPProfile <[v2bf16, v2bf16, v2bf16, v2bf16]>; def VOP_V2I16_F32_F32 : VOPProfile <[v2i16, f32, f32, untyped]>; def VOP_V2I16_I32_I32 : VOPProfile <[v2i16, i32, i32, untyped]>; @@ -2912,8 +2917,10 @@ def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>; def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>; def VOP_I16_F32_F32 : VOPProfile <[i16, f32, f32, 
untyped]>; def VOP_I32_I32_I32_ARITH : VOPProfile <[i32, i32, i32, untyped], /*EnableClamp=*/1>; +def VOP_I64_I64_I64_ARITH : VOPProfile <[i64, i64, i64, untyped], /*EnableClamp=*/1>; def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>; def VOP_F32_F16_F16_F16 : VOPProfile <[f32, f16, f16, f16]>; +def VOP_F32_BF16_BF16_BF16 : VOPProfile <[f32, bf16, bf16, bf16]>; def VOP_V2BF16_F32_F32 : VOPProfile <[v2bf16, f32, f32, untyped]>; def VOP_V32F32_V6I32_F32 : VOPProfile <[v32f32, v6i32, f32, untyped]>; def VOP_V32F16_V6I32_F32 : VOPProfile <[v32f16, v6i32, f32, untyped]>; diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index 5097ac03..b49c5a9 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -61,6 +61,7 @@ #include "AMDGPU.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIDefines.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/InitializePasses.h" @@ -1078,7 +1079,9 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI, if (EltOffset0 + CI.Width != EltOffset1 && EltOffset1 + Paired.Width != EltOffset0) return false; - if (CI.CPol != Paired.CPol) + // Instructions with scale_offset modifier cannot be combined unless we + // also generate a code to scale the offset and reset that bit. + if (CI.CPol != Paired.CPol || (CI.CPol & AMDGPU::CPol::SCAL)) return false; if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) { diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 3212060..0e8a420 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -704,16 +704,16 @@ void diagnoseUnknownMMRAASName(const MachineInstr &MI, StringRef AS) { DiagnosticInfoUnsupported(Fn, Str.str(), MI.getDebugLoc(), DS_Warning)); } -/// Reads \p MI's MMRAs to parse the "amdgpu-as" MMRA. -/// If this tag isn't present, or if it has no meaningful values, returns \p -/// Default. Otherwise returns all the address spaces concerned by the MMRA. -static SIAtomicAddrSpace getFenceAddrSpaceMMRA(const MachineInstr &MI, - SIAtomicAddrSpace Default) { - static constexpr StringLiteral FenceASPrefix = "amdgpu-as"; +/// Reads \p MI's MMRAs to parse the "amdgpu-synchronize-as" MMRA. +/// If this tag isn't present, or if it has no meaningful values, returns +/// \p none, otherwise returns the address spaces specified by the MD. +static std::optional<SIAtomicAddrSpace> +getSynchronizeAddrSpaceMD(const MachineInstr &MI) { + static constexpr StringLiteral FenceASPrefix = "amdgpu-synchronize-as"; auto MMRA = MMRAMetadata(MI.getMMRAMetadata()); if (!MMRA) - return Default; + return std::nullopt; SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE; for (const auto &[Prefix, Suffix] : MMRA) { @@ -726,7 +726,10 @@ static SIAtomicAddrSpace getFenceAddrSpaceMMRA(const MachineInstr &MI, diagnoseUnknownMMRAASName(MI, Suffix); } - return (Result != SIAtomicAddrSpace::NONE) ? 
Result : Default; + if (Result == SIAtomicAddrSpace::NONE) + return std::nullopt; + + return Result; } } // end anonymous namespace @@ -903,12 +906,19 @@ SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const { std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) = *ScopeOrNone; - if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) || - ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) { + if (OrderingAddrSpace != SIAtomicAddrSpace::ATOMIC) { + // We currently expect refineOrderingAS to be the only place that + // can refine the AS ordered by the fence. + // If that changes, we need to review the semantics of that function + // in case it needs to preserve certain address spaces. reportUnsupported(MI, "Unsupported atomic address space"); return std::nullopt; } + auto SynchronizeAS = getSynchronizeAddrSpaceMD(*MI); + if (SynchronizeAS) + OrderingAddrSpace = *SynchronizeAS; + return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic); } @@ -2687,11 +2697,7 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI, AtomicPseudoMIs.push_back(MI); bool Changed = false; - // Refine fenced address space based on MMRAs. - // - // TODO: Should we support this MMRA on other atomic operations? - auto OrderingAddrSpace = - getFenceAddrSpaceMMRA(*MI, MOI.getOrderingAddrSpace()); + const SIAtomicAddrSpace OrderingAddrSpace = MOI.getOrderingAddrSpace(); if (MOI.isAtomic()) { const AtomicOrdering Order = MOI.getOrdering(); diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td index d8b52d2..4bda51d 100644 --- a/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -856,16 +856,18 @@ def smrd_sextloadi16 : SMRDLoadPat<sextloadi16>; def smrd_prefetch : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type), (prefetch node:$ptr, node:$rw, node:$loc, node:$type), - [{ return !N->getOperand(1)->isDivergent();}]> { + [{ return !N->getOperand(1)->isDivergent() && Subtarget->hasSafeSmemPrefetch();}]> { let GISelPredicateCode = [{ - return isInstrUniform(MI); + return isInstrUniform(MI) && Subtarget->hasSafeSmemPrefetch(); }]; } def SMRDImm : ComplexPattern<iPTR, 2, "SelectSMRDImm">; def SMRDImm32 : ComplexPattern<iPTR, 2, "SelectSMRDImm32">; -def SMRDSgpr : ComplexPattern<iPTR, 2, "SelectSMRDSgpr">; -def SMRDSgprImm : ComplexPattern<iPTR, 3, "SelectSMRDSgprImm">; +let WantsRoot = true in { + def SMRDSgpr : ComplexPattern<iPTR, 3, "SelectSMRDSgpr", [], [], -3>; + def SMRDSgprImm : ComplexPattern<iPTR, 4, "SelectSMRDSgprImm", [], []>; +} def SMRDBufferImm : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm">; def SMRDBufferImm32 : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm32">; def SMRDBufferSgprImm : ComplexPattern<iPTR, 2, "SelectSMRDBufferSgprImm">; @@ -906,15 +908,15 @@ multiclass SMRD_Patterns <string Instr, ValueType vt, PatFrag frag, let SubtargetPredicate = isNotGFX9Plus; } def : GCNPat < - (frag (SMRDSgpr i64:$sbase, i32:$soffset)), - (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, 0, 0))> { + (frag (SMRDSgpr i64:$sbase, i32:$soffset, CPol:$cpol)), + (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, 0, $cpol))> { let SubtargetPredicate = isGFX9Plus; } // 4. 
SGPR+IMM offset def : GCNPat < - (frag (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)), - (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, $offset, 0))> { + (frag (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset, CPol:$cpol)), + (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, $offset, $cpol))> { let SubtargetPredicate = isGFX9Plus; } @@ -989,15 +991,15 @@ multiclass ScalarLoadWithExtensionPat <string Instr, SDPatternOperator node, Val // 2. SGPR offset def : GCNPat < - (node (SMRDSgpr i64:$sbase, i32:$soffset)), - (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, 0, 0))>{ + (node (SMRDSgpr i64:$sbase, i32:$soffset, CPol:$cpol)), + (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, 0, $cpol))>{ let SubtargetPredicate = isGFX12Plus; } // 3. SGPR+IMM offset def : GCNPat < - (node (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)), - (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, $offset, 0))>{ + (node (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset, CPol:$cpol)), + (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, $offset, $cpol))>{ let SubtargetPredicate = isGFX12Plus; } @@ -1150,6 +1152,7 @@ multiclass SMPrefetchPat<string type, TImmLeaf cache_type> { } defm : SMPrefetchPat<"INST", i32imm_zero>; +let AddedComplexity = 12 in // Prefer scalar prefetch over global for r/o case. defm : SMPrefetchPat<"DATA", i32imm_one>; let SubtargetPredicate = isGFX12Plus in { @@ -1488,6 +1491,7 @@ class SMEM_Real_Load_gfx12<bits<6> op, string ps, string opName, OffsetMode offs let Inst{20} = cpol{CPolBit.NV}; // non-volatile let Inst{22-21} = cpol{4-3}; // scope let Inst{24-23} = cpol{1-0}; // th - only lower 2 bits are supported + let Inst{56} = cpol{CPolBit.SCAL}; // scale offset } multiclass SM_Real_Loads_gfx12<bits<6> op, string ps = NAME> { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 9c6c374..b5b3cc9 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -3228,6 +3228,25 @@ const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t Format, : getGfx9BufferFormatInfo(Format); } +bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode) { + uint64_t TSFlags = MII.get(Opcode).TSFlags; + + if (TSFlags & SIInstrFlags::SMRD) + return !getSMEMIsBuffer(Opcode); + if (!(TSFlags & SIInstrFlags::FLAT)) + return false; + + // Only SV and SVS modes are supported. + if (TSFlags & SIInstrFlags::FlatScratch) + return hasNamedOperand(Opcode, OpName::vaddr); + + // Only GVS mode is supported. + return hasNamedOperand(Opcode, OpName::vaddr) && + hasNamedOperand(Opcode, OpName::saddr); + + return false; +} + bool hasAny64BitVGPROperands(const MCInstrDesc &OpDesc) { for (auto OpName : {OpName::vdst, OpName::src0, OpName::src1, OpName::src2}) { int Idx = getNamedOperandIdx(OpDesc.getOpcode(), OpName); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index bde951b..c09a9d6 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1757,6 +1757,9 @@ bool isIntrinsicSourceOfDivergence(unsigned IntrID); /// \returns true if the intrinsic is uniform bool isIntrinsicAlwaysUniform(unsigned IntrID); +/// \returns true if a memory instruction supports scale_offset modifier. +bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode); + /// \returns lds block size in terms of dwords. 
\p /// This is used to calculate the lds size encoded for PAL metadata 3.0+ which /// must be defined in terms of bytes. diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 030a6e1..550ec9d 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -925,6 +925,17 @@ let isAdd = 1 in { defm V_ADDC_U32 : VOP2bInst <"v_addc_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "v_addc_u32">; } +let isReMaterializable = 1 in { +let SubtargetPredicate = HasAddSubU64Insts, SchedRW = [Write64Bit] in { +defm V_ADD_U64 : VOP2Inst <"v_add_nc_u64", VOP_I64_I64_I64_ARITH>; +// We don't actually have something like V_SUBREV_U64 so V_SUB_U64 can't be treated as commutable. +let isCommutable = 0 in +defm V_SUB_U64 : VOP2Inst <"v_sub_nc_u64", VOP_I64_I64_I64_ARITH>; +} // End SubtargetPredicate = HasAddSubU64Insts, SchedRW = [Write64Bit] +let SubtargetPredicate = isGFX1250Plus, SchedRW = [WriteDouble] in +defm V_MUL_U64 : VOP2Inst <"v_mul_u64", VOP_I64_I64_I64, DivergentBinFrag<mul>>; +} // End isReMaterializable = 1 + } // End isCommutable = 1 // These are special and do not read the exec mask. @@ -1754,6 +1765,9 @@ multiclass VOP2_Real_FULL_with_name<GFXGen Gen, bits<6> op, string opName, VOP2_Realtriple_e64_with_name<Gen, op, opName, asmName>, VOP2_Real_NO_VOP3_with_name<Gen, op, opName, asmName>; +multiclass VOP2_Real_NO_DPP<GFXGen Gen, bits<6> op> : + VOP2_Real_e32<Gen, op>, VOP2_Real_e64<Gen, op>; + multiclass VOP2_Real_NO_DPP_with_name<GFXGen Gen, bits<6> op, string opName, string asmName> { defm NAME : VOP2_Real_e32_with_name<Gen, op, opName, asmName>, @@ -1843,6 +1857,9 @@ defm V_FMAC_F64 : VOP2_Real_FULL<GFX12Gen, 0x17>; defm V_FMAMK_F64 : VOP2Only_Real_MADK64<GFX1250Gen, 0x23>; defm V_FMAAK_F64 : VOP2Only_Real_MADK64<GFX1250Gen, 0x24>; +defm V_ADD_U64 : VOP2_Real_FULL<GFX1250Gen, 0x28>; +defm V_SUB_U64 : VOP2_Real_FULL<GFX1250Gen, 0x29>; +defm V_MUL_U64 : VOP2_Real_NO_DPP<GFX1250Gen, 0x2a>; //===----------------------------------------------------------------------===// // GFX11. diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index aee2f2c..b6f9568 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -1918,6 +1918,7 @@ let AssemblerPredicate = isGFX11Plus in { // These instructions differ from GFX12 variant by supporting DPP: defm V_LSHL_ADD_U64 : VOP3Only_Realtriple_gfx1250<0x252>; +defm V_CVT_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250<0x36d>; //===----------------------------------------------------------------------===// // GFX10. 
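For cross-reference, the scale_offset plumbing above (the SCAL bit in SIDefines.h, the cpol parsing and validation in AMDGPUAsmParser.cpp, supportsScaleOffset in AMDGPUBaseInfo.cpp, and the Inst{56}/Inst{48} lines in the SMEM and VFLAT real-encoding classes) all refer to a single cache-policy bit. A small sketch of where that bit lands; encodeScaleOffset is an illustrative helper, not in-tree code.

#include <cstdint>

constexpr uint32_t CPOL_SCAL = 1u << 11; // SIDefines.h: SCAL = 1 << 11

// Opcodes that may set the bit are gated by supportsScaleOffset(): non-buffer
// SMEM, scratch instructions with a vaddr, and FLAT global instructions with
// both vaddr and saddr. The bit is then copied from the MI-level cpol operand
// into the encoding: Inst{56} for GFX12 SMEM loads, Inst{48} for VFLAT.
uint64_t encodeScaleOffset(uint64_t Inst, uint32_t CPol, bool IsSMEM) {
  if (CPol & CPOL_SCAL)
    Inst |= IsSMEM ? (1ull << 56) : (1ull << 48);
  return Inst;
}

In the assembler the same bit is reached through the scale_offset / noscale_offset cpol tokens parsed above, and SILoadStoreOptimizer declines to merge accesses that carry it.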
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 9feea36..c812dc9 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -35,14 +35,18 @@ class VOP3P_Mix_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR, bit useTiedOutput = 0> : VOP3P_Profile<P, Features, 1> { bit UseTiedOutput = useTiedOutput; + defvar Src0RC = getVCSrcForVT<P.Src0VT>.ret; + defvar Src1RC = getVCSrcForVT<P.Src1VT>.ret; + defvar Src2RC = getVCSrcForVT<P.Src2VT>.ret; + dag srcs = - (ins FP16InputMods:$src0_modifiers, VCSrc_f16:$src0, - FP16InputMods:$src1_modifiers, VCSrc_f16:$src1, - FP16InputMods:$src2_modifiers, VCSrc_f16:$src2); + (ins FP16InputMods:$src0_modifiers, Src0RC:$src0, + FP16InputMods:$src1_modifiers, Src1RC:$src1, + FP16InputMods:$src2_modifiers, Src2RC:$src2); dag dpp_srcs = (ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0, FPVRegInputMods:$src1_modifiers, VRegSrc_32:$src1, - FP16InputMods:$src2_modifiers, VCSrc_f16:$src2); + FP16InputMods:$src2_modifiers, Src2RC:$src2); // FIXME: Clamp0 misbehaves with the non-default vdst_in // following it. For now workaround this by requiring clamp @@ -144,48 +148,59 @@ def : VOP3PSatPat<usubsat, V_PK_SUB_U16>; def : VOP3PSatPat<ssubsat, V_PK_SUB_I16>; } // End SubtargetPredicate = HasVOP3PInsts -let SubtargetPredicate = HasMinimum3Maximum3PKF16, FPDPRounding = 1 in { +let isCommutable = 1, FPDPRounding = 1 in { +let SubtargetPredicate = HasMin3Max3PKF16 in { +defm V_PK_MIN3_NUM_F16 : VOP3PInst<"v_pk_min3_num_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, AMDGPUfmin3>; +defm V_PK_MAX3_NUM_F16 : VOP3PInst<"v_pk_max3_num_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, AMDGPUfmax3>; +} + +let SubtargetPredicate = HasMinimum3Maximum3PKF16 in { defm V_PK_MINIMUM3_F16 : VOP3PInst<"v_pk_minimum3_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, AMDGPUfminimum3>; defm V_PK_MAXIMUM3_F16 : VOP3PInst<"v_pk_maximum3_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, AMDGPUfmaximum3>; } +} // End isCommutable = 1, FPDPRounding = 1 // TODO: Make sure we're doing the right thing with denormals. Note // that FMA and MAD will differ. multiclass MadFmaMixPats<SDPatternOperator fma_like, Instruction mix_inst, Instruction mixlo_inst, - Instruction mixhi_inst> { + Instruction mixhi_inst, + ValueType VT = f16, + ValueType vecVT = v2f16> { + defvar VOP3PMadMixModsPat = !if (!eq(VT, bf16), VOP3PMadMixBF16Mods, VOP3PMadMixMods); + defvar VOP3PMadMixModsExtPat = !if (!eq(VT, bf16), VOP3PMadMixBF16ModsExt, VOP3PMadMixModsExt); // At least one of the operands needs to be an fpextend of an f16 // for this to be worthwhile, so we need three patterns here. // TODO: Could we use a predicate to inspect src1/2/3 instead? 
def : GCNPat < - (f32 (fma_like (f32 (VOP3PMadMixModsExt f16:$src0, i32:$src0_mods)), - (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_mods)), - (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_mods)))), + (f32 (fma_like (f32 (VOP3PMadMixModsExtPat VT:$src0, i32:$src0_mods)), + (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_mods)), + (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_mods)))), (mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE)>; def : GCNPat < - (f32 (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_mods)), - (f32 (VOP3PMadMixModsExt f16:$src1, i32:$src1_mods)), - (f32 (VOP3PMadMixMods f32:$src2, i32:$src2_mods)))), + (f32 (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_mods)), + (f32 (VOP3PMadMixModsExtPat VT:$src1, i32:$src1_mods)), + (f32 (VOP3PMadMixModsPat f32:$src2, i32:$src2_mods)))), (mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE)>; def : GCNPat < - (f32 (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_mods)), - (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_mods)), - (f32 (VOP3PMadMixModsExt f16:$src2, i32:$src2_mods)))), + (f32 (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_mods)), + (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_mods)), + (f32 (VOP3PMadMixModsExtPat VT:$src2, i32:$src2_mods)))), (mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE)>; def : GCNPat < (AMDGPUclamp (build_vector - (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers))))), - (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers))))))), - (v2f16 (mixhi_inst $hi_src0_modifiers, $hi_src0, + (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$lo_src0, i32:$lo_src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$lo_src1, i32:$lo_src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$lo_src2, i32:$lo_src2_modifiers))))), + (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$hi_src0, i32:$hi_src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$hi_src1, i32:$hi_src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$hi_src2, i32:$hi_src2_modifiers))))))), + (vecVT (mixhi_inst $hi_src0_modifiers, $hi_src0, $hi_src1_modifiers, $hi_src1, $hi_src2_modifiers, $hi_src2, DSTCLAMP.ENABLE, @@ -197,8 +212,8 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like, >; def : GCNPat < - (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers))))), + (VT (fpround (fmul (f32 (VOP3PMadMixModsPat f32:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_modifiers))))), (mixlo_inst $src0_modifiers, $src0, $src1_modifiers, $src1, (i32 0), (i32 0), @@ -207,9 +222,9 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like, >; def : GCNPat < - (build_vector f16:$elt0, (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers)))))), - (v2f16 (mixhi_inst $src0_modifiers, $src0, + (build_vector VT:$elt0, (VT (fpround (fmul (f32 (VOP3PMadMixModsPat f32:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_modifiers)))))), + (vecVT (mixhi_inst $src0_modifiers, $src0, $src1_modifiers, $src1, (i32 0), (i32 0), DSTCLAMP.NONE, @@ -217,9 +232,9 @@ 
multiclass MadFmaMixPats<SDPatternOperator fma_like, >; def : GCNPat < - (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))), + (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))), (mixlo_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, @@ -234,10 +249,10 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like, let True16Predicate = p in { def : GCNPat < - (build_vector f16:$elt0, (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))), - (v2f16 (mixhi_inst $src0_modifiers, $src0, + (build_vector VT:$elt0, (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers)))))), + (vecVT (mixhi_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, DSTCLAMP.NONE, @@ -246,11 +261,11 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like, def : GCNPat < (build_vector - f16:$elt0, - (AMDGPUclamp (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))))), - (v2f16 (mixhi_inst $src0_modifiers, $src0, + VT:$elt0, + (AMDGPUclamp (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))))), + (vecVT (mixhi_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, DSTCLAMP.ENABLE, @@ -261,38 +276,38 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like, let True16Predicate = UseRealTrue16Insts in { def : GCNPat < - (build_vector (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))), f16:$elt1), - (v2f16 (mixlo_inst $src0_modifiers, $src0, + (build_vector (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))), VT:$elt1), + (vecVT (mixlo_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, DSTCLAMP.NONE, - (REG_SEQUENCE VGPR_32, (f16 (IMPLICIT_DEF)), lo16, $elt1, hi16))) + (REG_SEQUENCE VGPR_32, (VT (IMPLICIT_DEF)), lo16, $elt1, hi16))) >; def : GCNPat < - (build_vector f16:$elt0, (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))), - (v2f16 (mixhi_inst $src0_modifiers, $src0, + (build_vector VT:$elt0, (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers)))))), + (vecVT (mixhi_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, DSTCLAMP.NONE, - (REG_SEQUENCE VGPR_32, 
$elt0, lo16, (f16 (IMPLICIT_DEF)), hi16))) + (REG_SEQUENCE VGPR_32, $elt0, lo16, (VT (IMPLICIT_DEF)), hi16))) >; def : GCNPat < (build_vector - f16:$elt0, - (AMDGPUclamp (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))))), - (v2f16 (mixhi_inst $src0_modifiers, $src0, + VT:$elt0, + (AMDGPUclamp (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))))), + (vecVT (mixhi_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, DSTCLAMP.ENABLE, - (REG_SEQUENCE VGPR_32, $elt0, lo16, (f16 (IMPLICIT_DEF)), hi16))) + (REG_SEQUENCE VGPR_32, $elt0, lo16, (VT (IMPLICIT_DEF)), hi16))) >; } // end True16Predicate } @@ -353,6 +368,67 @@ defm V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3P_Mix_Profile<VOP_F defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>; } +let SubtargetPredicate = HasFmaMixBF16Insts in { +let isCommutable = 1 in { + +let isReMaterializable = 1 in +defm V_FMA_MIX_F32_BF16 : VOP3_VOP3PInst<"v_fma_mix_f32_bf16", VOP3P_Mix_Profile<VOP_F32_BF16_BF16_BF16, VOP3_OPSEL>>; + +let FPDPRounding = 1 in { +defm V_FMA_MIXLO_BF16 : VOP3_VOP3PInst<"v_fma_mixlo_bf16", VOP3P_Mix_Profile<VOP_BF16_BF16_BF16_BF16, VOP3_OPSEL, 1>>; + +let ClampLo = 0, ClampHi = 1 in { +defm V_FMA_MIXHI_BF16 : VOP3_VOP3PInst<"v_fma_mixhi_bf16", VOP3P_Mix_Profile<VOP_BF16_BF16_BF16_BF16, VOP3_OPSEL, 1>>; +} +} // End FPDPRounding = 1 +} // End isCommutable = 1 + +defm : MadFmaMixPats<fma, V_FMA_MIX_F32_BF16, V_FMA_MIXLO_BF16, V_FMA_MIXHI_BF16, bf16, v2bf16>; +} // End SubtargetPredicate = HasFmaMixBF16Insts + +def PK_ADD_MINMAX_Profile : VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16, VOP3_PACKED> { + let HasModifiers = 0; +} + +let isCommutable = 1, isReMaterializable = 1 in { +let SubtargetPredicate = HasPkAddMinMaxInsts in { +defm V_PK_ADD_MAX_I16 : VOP3PInst<"v_pk_add_max_i16", PK_ADD_MINMAX_Profile>; +defm V_PK_ADD_MAX_U16 : VOP3PInst<"v_pk_add_max_u16", PK_ADD_MINMAX_Profile>; +defm V_PK_ADD_MIN_I16 : VOP3PInst<"v_pk_add_min_i16", PK_ADD_MINMAX_Profile>; +defm V_PK_ADD_MIN_U16 : VOP3PInst<"v_pk_add_min_u16", PK_ADD_MINMAX_Profile>; +} +let SubtargetPredicate = HasPkMinMax3Insts in { +defm V_PK_MAX3_I16 : VOP3PInst<"v_pk_max3_i16", PK_ADD_MINMAX_Profile>; +defm V_PK_MAX3_U16 : VOP3PInst<"v_pk_max3_u16", PK_ADD_MINMAX_Profile>; +defm V_PK_MIN3_I16 : VOP3PInst<"v_pk_min3_i16", PK_ADD_MINMAX_Profile>; +defm V_PK_MIN3_U16 : VOP3PInst<"v_pk_min3_u16", PK_ADD_MINMAX_Profile>; +} +} // End isCommutable = 1, isReMaterializable = 1 + +// TODO: Extend pattern to select op_sel and op_sel_hi. 
+class ThreeOp_OpSelClampPats <SDPatternOperator op1, SDPatternOperator op2, + VOP3P_Pseudo inst, + ValueType vt = inst.Pfl.Src0VT, + RegisterOperand RC = getVCSrcForVT<vt>.ret> : GCNPat < + (ThreeOpFrag<op1, op2> vt:$src0, vt:$src1, vt:$src2), + (inst SRCMODS.OP_SEL_1, RC:$src0, SRCMODS.OP_SEL_1, RC:$src1, + SRCMODS.OP_SEL_1, RC:$src2, DSTCLAMP.NONE, 0) +>; + +let SubtargetPredicate = HasPkAddMinMaxInsts in { +def : ThreeOp_OpSelClampPats<add, smax, V_PK_ADD_MAX_I16>; +def : ThreeOp_OpSelClampPats<add, umax, V_PK_ADD_MAX_U16>; +def : ThreeOp_OpSelClampPats<add, smin, V_PK_ADD_MIN_I16>; +def : ThreeOp_OpSelClampPats<add, umin, V_PK_ADD_MIN_U16>; +} + +let SubtargetPredicate = HasPkMinMax3Insts in { +def : ThreeOp_OpSelClampPats<smax, smax, V_PK_MAX3_I16>; +def : ThreeOp_OpSelClampPats<umax, umax, V_PK_MAX3_U16>; +def : ThreeOp_OpSelClampPats<smin, smin, V_PK_MIN3_I16>; +def : ThreeOp_OpSelClampPats<umin, umin, V_PK_MIN3_U16>; +} + // Defines patterns that extract signed 4bit from each Idx[0]. foreach Idx = [[0,28],[4,24],[8,20],[12,16],[16,12],[20,8],[24,4]] in def ExtractSigned4bit_#Idx[0] : PatFrag<(ops node:$src), @@ -1153,6 +1229,14 @@ let isCommutable = 1, isReMaterializable = 1 in { let SubtargetPredicate = HasPkMovB32, isAsCheapAsAMove = 1 in defm V_PK_MOV_B32 : VOP3PInst<"v_pk_mov_b32", VOP3P_Profile<VOP_V2I32_V2I32_V2I32, VOP3_PACKED>>; + + let SubtargetPredicate = HasBF16PackedInsts in { + defm V_PK_ADD_BF16 : VOP3PInst<"v_pk_add_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, any_fadd>; + defm V_PK_MUL_BF16 : VOP3PInst<"v_pk_mul_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, any_fmul>; + defm V_PK_MIN_NUM_BF16 : VOP3PInst<"v_pk_min_num_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, fminnum_like>; + defm V_PK_MAX_NUM_BF16 : VOP3PInst<"v_pk_max_num_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, fmaxnum_like>; + defm V_PK_FMA_BF16 : VOP3PInst<"v_pk_fma_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, any_fma>; + } } // End isCommutable = 1, isReMaterializable = 1 def : AMDGPUMnemonicAlias<"v_accvgpr_read", "v_accvgpr_read_b32">; @@ -2157,6 +2241,8 @@ multiclass VOP3P_Realtriple_gfx11_gfx12<bits<8> op> multiclass VOP3P_Real_gfx12<bits<8> op> : VOP3P_Real_Base<GFX12Gen, op>; +multiclass VOP3P_Real_gfx1250<bits<8> op> : VOP3P_Real_Base<GFX1250Gen, op>; + multiclass VOP3P_Real_with_name_gfx12<bits<8> op, string backing_ps_name = NAME, string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> : @@ -2165,6 +2251,35 @@ multiclass VOP3P_Real_with_name_gfx12<bits<8> op, defm V_PK_MIN_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1b, "V_PK_MIN_F16", "v_pk_min_num_f16">; defm V_PK_MAX_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1c, "V_PK_MAX_F16", "v_pk_max_num_f16">; +defm V_PK_FMA_F32 : VOP3P_Real_gfx12<0x1f>; +defm V_PK_MUL_F32 : VOP3P_Real_gfx12<0x28>; +defm V_PK_ADD_F32 : VOP3P_Real_gfx12<0x29>; + +defm V_PK_ADD_MAX_I16 : VOP3P_Real_gfx1250<0x14>; +defm V_PK_ADD_MAX_U16 : VOP3P_Real_gfx1250<0x15>; +defm V_PK_ADD_MIN_I16 : VOP3P_Real_gfx1250<0x2d>; +defm V_PK_ADD_MIN_U16 : VOP3P_Real_gfx1250<0x2e>; +defm V_PK_MAX3_I16 : VOP3P_Real_gfx1250<0x2f>; +defm V_PK_MAX3_U16 : VOP3P_Real_gfx1250<0x30>; +defm V_PK_MIN3_I16 : VOP3P_Real_gfx1250<0x31>; +defm V_PK_MIN3_U16 : VOP3P_Real_gfx1250<0x32>; +defm V_PK_FMA_BF16 : VOP3P_Real_gfx1250<0x11>; +defm V_PK_ADD_BF16 : VOP3P_Real_gfx1250<0x23>; +defm V_PK_MUL_BF16 : VOP3P_Real_gfx1250<0x2a>; +defm V_PK_MIN_NUM_BF16 : VOP3P_Real_gfx1250<0x2b>; +defm V_PK_MAX_NUM_BF16 : VOP3P_Real_gfx1250<0x2c>; 
+defm V_PK_MINIMUM3_F16 : VOP3P_Real_gfx1250<0x36>; +defm V_PK_MAXIMUM3_F16 : VOP3P_Real_gfx1250<0x37>; +defm V_PK_MIN3_NUM_F16 : VOP3P_Real_gfx1250<0x38>; +defm V_PK_MAX3_NUM_F16 : VOP3P_Real_gfx1250<0x39>; + +defm V_FMA_MIX_F32_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3d>; +defm V_FMA_MIXLO_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3e>; +defm V_FMA_MIXHI_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3f>; + +let AssemblerPredicate = isGFX1250Plus in +def : AMDGPUMnemonicAlias<"v_fma_mix_f32_f16", "v_fma_mix_f32">; + defm V_PK_MINIMUM_F16 : VOP3P_Real_gfx12<0x1d>; defm V_PK_MAXIMUM_F16 : VOP3P_Real_gfx12<0x1e>; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index fd3b052..fca5dff 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -20347,6 +20347,13 @@ ARMTargetLowering::getSingleConstraintMatchWeight( return weight; } +static bool isIncompatibleReg(const MCPhysReg &PR, MVT VT) { + if (PR == 0 || VT == MVT::Other) + return false; + return (ARM::SPRRegClass.contains(PR) && VT != MVT::f32 && VT != MVT::i32) || + (ARM::DPRRegClass.contains(PR) && VT != MVT::f64); +} + using RCPair = std::pair<unsigned, const TargetRegisterClass *>; RCPair ARMTargetLowering::getRegForInlineAsmConstraint( @@ -20420,7 +20427,10 @@ RCPair ARMTargetLowering::getRegForInlineAsmConstraint( if (StringRef("{cc}").equals_insensitive(Constraint)) return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass); - return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); + auto RCP = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); + if (isIncompatibleReg(RCP.first, VT)) + return {0, nullptr}; + return RCP; } /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops @@ -21731,11 +21741,16 @@ bool ARMTargetLowering::lowerInterleavedLoad( /// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35> /// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19> /// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4) -bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, +bool ARMTargetLowering::lowerInterleavedStore(Instruction *Store, + Value *LaneMask, ShuffleVectorInst *SVI, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); + auto *SI = dyn_cast<StoreInst>(Store); + if (!SI) + return false; + assert(!LaneMask && "Unexpected mask on store"); auto *VecTy = cast<FixedVectorType>(SVI->getType()); assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store"); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 9159f3d..825145d 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -685,7 +685,8 @@ class VectorType; ArrayRef<ShuffleVectorInst *> Shuffles, ArrayRef<unsigned> Indices, unsigned Factor) const override; - bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, + bool lowerInterleavedStore(Instruction *Store, Value *Mask, + ShuffleVectorInst *SVI, unsigned Factor) const override; bool shouldInsertFencesForAtomic(const Instruction *I) const override; diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp index 5963976..6ec78d0 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp @@ -7,12 +7,10 @@ 
//===----------------------------------------------------------------------===// #include "AVRMCExpr.h" -#include "MCTargetDesc/AVRMCAsmInfo.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" -#include "llvm/MC/MCValue.h" namespace llvm { diff --git a/llvm/lib/Target/BPF/BPF.h b/llvm/lib/Target/BPF/BPF.h index 5d49949..7faae8b 100644 --- a/llvm/lib/Target/BPF/BPF.h +++ b/llvm/lib/Target/BPF/BPF.h @@ -22,7 +22,7 @@ class BPFTargetMachine; class InstructionSelector; class PassRegistry; -static const char *BPF_TRAP = "__bpf_trap"; +#define BPF_TRAP "__bpf_trap" ModulePass *createBPFCheckAndAdjustIR(); diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp index a0011e8..fa9007e 100644 --- a/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp +++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp @@ -16,7 +16,6 @@ #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" using namespace llvm; diff --git a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp index d9d9b36..feecfc0 100644 --- a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp +++ b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp @@ -301,41 +301,53 @@ bool DataScalarizerVisitor::visitExtractElementInst(ExtractElementInst &EEI) { } bool DataScalarizerVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) { - Value *PtrOperand = GEPI.getPointerOperand(); - Type *OrigGEPType = GEPI.getSourceElementType(); - Type *NewGEPType = OrigGEPType; + GEPOperator *GOp = cast<GEPOperator>(&GEPI); + Value *PtrOperand = GOp->getPointerOperand(); + Type *NewGEPType = GOp->getSourceElementType(); bool NeedsTransform = false; + // Unwrap GEP ConstantExprs to find the base operand and element type + while (auto *CE = dyn_cast<ConstantExpr>(PtrOperand)) { + if (auto *GEPCE = dyn_cast<GEPOperator>(CE)) { + GOp = GEPCE; + PtrOperand = GEPCE->getPointerOperand(); + NewGEPType = GEPCE->getSourceElementType(); + } else + break; + } + if (GlobalVariable *NewGlobal = lookupReplacementGlobal(PtrOperand)) { NewGEPType = NewGlobal->getValueType(); PtrOperand = NewGlobal; NeedsTransform = true; } else if (AllocaInst *Alloca = dyn_cast<AllocaInst>(PtrOperand)) { Type *AllocatedType = Alloca->getAllocatedType(); - // Only transform if the allocated type is an array - if (AllocatedType != OrigGEPType && isa<ArrayType>(AllocatedType)) { + if (isa<ArrayType>(AllocatedType) && + AllocatedType != GOp->getResultElementType()) { NewGEPType = AllocatedType; NeedsTransform = true; } } - // Scalar geps should remain scalars geps. 
The dxil-flatten-arrays pass will - // convert these scalar geps into flattened array geps - if (!isa<ArrayType>(OrigGEPType)) - NewGEPType = OrigGEPType; - - // Note: We bail if this isn't a gep touched via alloca or global - // transformations if (!NeedsTransform) return false; - IRBuilder<> Builder(&GEPI); - SmallVector<Value *, MaxVecSize> Indices(GEPI.indices()); + // Keep scalar GEPs scalar; dxil-flatten-arrays will do flattening later + if (!isa<ArrayType>(GOp->getSourceElementType())) + NewGEPType = GOp->getSourceElementType(); + IRBuilder<> Builder(&GEPI); + SmallVector<Value *, MaxVecSize> Indices(GOp->indices()); Value *NewGEP = Builder.CreateGEP(NewGEPType, PtrOperand, Indices, - GEPI.getName(), GEPI.getNoWrapFlags()); - GEPI.replaceAllUsesWith(NewGEP); - GEPI.eraseFromParent(); + GOp->getName(), GOp->getNoWrapFlags()); + + GOp->replaceAllUsesWith(NewGEP); + + if (auto *CE = dyn_cast<ConstantExpr>(GOp)) + CE->destroyConstant(); + else if (auto *OldGEPI = dyn_cast<GetElementPtrInst>(GOp)) + OldGEPI->eraseFromParent(); + return true; } diff --git a/llvm/lib/Target/DirectX/DXILPrepare.cpp b/llvm/lib/Target/DirectX/DXILPrepare.cpp index 703a9e5..c8866bf 100644 --- a/llvm/lib/Target/DirectX/DXILPrepare.cpp +++ b/llvm/lib/Target/DirectX/DXILPrepare.cpp @@ -24,7 +24,6 @@ #include "llvm/IR/AttributeMask.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" -#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" @@ -240,11 +239,6 @@ public: for (size_t Idx = 0, End = F.arg_size(); Idx < End; ++Idx) F.removeParamAttrs(Idx, AttrMask); - // Lifetime intrinsics in LLVM 3.7 do not have the memory FnAttr - if (Intrinsic::ID IID = F.getIntrinsicID(); - IID == Intrinsic::lifetime_start || IID == Intrinsic::lifetime_end) - F.removeFnAttr(Attribute::Memory); - for (auto &BB : F) { IRBuilder<> Builder(&BB); for (auto &I : make_early_inc_range(BB)) { @@ -253,7 +247,7 @@ public: // Emtting NoOp bitcast instructions allows the ValueEnumerator to be // unmodified as it reserves instruction IDs during contruction. 
- if (auto *LI = dyn_cast<LoadInst>(&I)) { + if (auto LI = dyn_cast<LoadInst>(&I)) { if (Value *NoOpBitcast = maybeGenerateBitcast( Builder, PointerTypes, I, LI->getPointerOperand(), LI->getType())) { @@ -263,7 +257,7 @@ public: } continue; } - if (auto *SI = dyn_cast<StoreInst>(&I)) { + if (auto SI = dyn_cast<StoreInst>(&I)) { if (Value *NoOpBitcast = maybeGenerateBitcast( Builder, PointerTypes, I, SI->getPointerOperand(), SI->getValueOperand()->getType())) { @@ -274,7 +268,7 @@ public: } continue; } - if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) { + if (auto GEP = dyn_cast<GetElementPtrInst>(&I)) { if (Value *NoOpBitcast = maybeGenerateBitcast( Builder, PointerTypes, I, GEP->getPointerOperand(), GEP->getSourceElementType())) @@ -286,17 +280,6 @@ public: CB->removeRetAttrs(AttrMask); for (size_t Idx = 0, End = CB->arg_size(); Idx < End; ++Idx) CB->removeParamAttrs(Idx, AttrMask); - // LLVM 3.7 Lifetime intrinics require an i8* pointer operand, so we - // insert a bitcast here to ensure that is the case - if (isa<LifetimeIntrinsic>(CB)) { - Value *PtrOperand = CB->getArgOperand(1); - Builder.SetInsertPoint(CB); - PointerType *PtrTy = cast<PointerType>(PtrOperand->getType()); - Value *NoOpBitcast = Builder.Insert( - CastInst::Create(Instruction::BitCast, PtrOperand, - Builder.getPtrTy(PtrTy->getAddressSpace()))); - CB->setArgOperand(1, NoOpBitcast); - } continue; } } diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp index dfc8162..ebdfcaa 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp +++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/Twine.h" #include "llvm/Analysis/DXILMetadataAnalysis.h" #include "llvm/BinaryFormat/DXContainer.h" +#include "llvm/Frontend/HLSL/RootSignatureMetadata.h" #include "llvm/Frontend/HLSL/RootSignatureValidations.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DiagnosticInfo.h" @@ -29,25 +30,10 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include <cstdint> -#include <optional> -#include <utility> using namespace llvm; using namespace llvm::dxil; -static bool reportError(LLVMContext *Ctx, Twine Message, - DiagnosticSeverity Severity = DS_Error) { - Ctx->diagnose(DiagnosticInfoGeneric(Message, Severity)); - return true; -} - -static bool reportValueError(LLVMContext *Ctx, Twine ParamName, - uint32_t Value) { - Ctx->diagnose(DiagnosticInfoGeneric( - "Invalid value for " + ParamName + ": " + Twine(Value), DS_Error)); - return true; -} - static std::optional<uint32_t> extractMdIntValue(MDNode *Node, unsigned int OpId) { if (auto *CI = @@ -56,453 +42,10 @@ static std::optional<uint32_t> extractMdIntValue(MDNode *Node, return std::nullopt; } -static std::optional<float> extractMdFloatValue(MDNode *Node, - unsigned int OpId) { - if (auto *CI = mdconst::dyn_extract<ConstantFP>(Node->getOperand(OpId).get())) - return CI->getValueAPF().convertToFloat(); - return std::nullopt; -} - -static std::optional<StringRef> extractMdStringValue(MDNode *Node, - unsigned int OpId) { - MDString *NodeText = dyn_cast<MDString>(Node->getOperand(OpId)); - if (NodeText == nullptr) - return std::nullopt; - return NodeText->getString(); -} - -static bool parseRootFlags(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD, - MDNode *RootFlagNode) { - - if (RootFlagNode->getNumOperands() != 2) - return reportError(Ctx, "Invalid format for RootFlag Element"); - - if (std::optional<uint32_t> Val = extractMdIntValue(RootFlagNode, 1)) - RSD.Flags = *Val; - 
else - return reportError(Ctx, "Invalid value for RootFlag"); - - return false; -} - -static bool parseRootConstants(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD, - MDNode *RootConstantNode) { - - if (RootConstantNode->getNumOperands() != 5) - return reportError(Ctx, "Invalid format for RootConstants Element"); - - dxbc::RTS0::v1::RootParameterHeader Header; - // The parameter offset doesn't matter here - we recalculate it during - // serialization Header.ParameterOffset = 0; - Header.ParameterType = - llvm::to_underlying(dxbc::RootParameterType::Constants32Bit); - - if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 1)) - Header.ShaderVisibility = *Val; - else - return reportError(Ctx, "Invalid value for ShaderVisibility"); - - dxbc::RTS0::v1::RootConstants Constants; - if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 2)) - Constants.ShaderRegister = *Val; - else - return reportError(Ctx, "Invalid value for ShaderRegister"); - - if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 3)) - Constants.RegisterSpace = *Val; - else - return reportError(Ctx, "Invalid value for RegisterSpace"); - - if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 4)) - Constants.Num32BitValues = *Val; - else - return reportError(Ctx, "Invalid value for Num32BitValues"); - - RSD.ParametersContainer.addParameter(Header, Constants); - - return false; -} - -static bool parseRootDescriptors(LLVMContext *Ctx, - mcdxbc::RootSignatureDesc &RSD, - MDNode *RootDescriptorNode, - RootSignatureElementKind ElementKind) { - assert(ElementKind == RootSignatureElementKind::SRV || - ElementKind == RootSignatureElementKind::UAV || - ElementKind == RootSignatureElementKind::CBV && - "parseRootDescriptors should only be called with RootDescriptor " - "element kind."); - if (RootDescriptorNode->getNumOperands() != 5) - return reportError(Ctx, "Invalid format for Root Descriptor Element"); - - dxbc::RTS0::v1::RootParameterHeader Header; - switch (ElementKind) { - case RootSignatureElementKind::SRV: - Header.ParameterType = llvm::to_underlying(dxbc::RootParameterType::SRV); - break; - case RootSignatureElementKind::UAV: - Header.ParameterType = llvm::to_underlying(dxbc::RootParameterType::UAV); - break; - case RootSignatureElementKind::CBV: - Header.ParameterType = llvm::to_underlying(dxbc::RootParameterType::CBV); - break; - default: - llvm_unreachable("invalid Root Descriptor kind"); - break; - } - - if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 1)) - Header.ShaderVisibility = *Val; - else - return reportError(Ctx, "Invalid value for ShaderVisibility"); - - dxbc::RTS0::v2::RootDescriptor Descriptor; - if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 2)) - Descriptor.ShaderRegister = *Val; - else - return reportError(Ctx, "Invalid value for ShaderRegister"); - - if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 3)) - Descriptor.RegisterSpace = *Val; - else - return reportError(Ctx, "Invalid value for RegisterSpace"); - - if (RSD.Version == 1) { - RSD.ParametersContainer.addParameter(Header, Descriptor); - return false; - } - assert(RSD.Version > 1); - - if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 4)) - Descriptor.Flags = *Val; - else - return reportError(Ctx, "Invalid value for Root Descriptor Flags"); - - RSD.ParametersContainer.addParameter(Header, Descriptor); - return false; -} - -static bool parseDescriptorRange(LLVMContext *Ctx, - 
mcdxbc::DescriptorTable &Table, - MDNode *RangeDescriptorNode) { - - if (RangeDescriptorNode->getNumOperands() != 6) - return reportError(Ctx, "Invalid format for Descriptor Range"); - - dxbc::RTS0::v2::DescriptorRange Range; - - std::optional<StringRef> ElementText = - extractMdStringValue(RangeDescriptorNode, 0); - - if (!ElementText.has_value()) - return reportError(Ctx, "Descriptor Range, first element is not a string."); - - Range.RangeType = - StringSwitch<uint32_t>(*ElementText) - .Case("CBV", llvm::to_underlying(dxbc::DescriptorRangeType::CBV)) - .Case("SRV", llvm::to_underlying(dxbc::DescriptorRangeType::SRV)) - .Case("UAV", llvm::to_underlying(dxbc::DescriptorRangeType::UAV)) - .Case("Sampler", - llvm::to_underlying(dxbc::DescriptorRangeType::Sampler)) - .Default(~0U); - - if (Range.RangeType == ~0U) - return reportError(Ctx, "Invalid Descriptor Range type: " + *ElementText); - - if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 1)) - Range.NumDescriptors = *Val; - else - return reportError(Ctx, "Invalid value for Number of Descriptor in Range"); - - if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 2)) - Range.BaseShaderRegister = *Val; - else - return reportError(Ctx, "Invalid value for BaseShaderRegister"); - - if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 3)) - Range.RegisterSpace = *Val; - else - return reportError(Ctx, "Invalid value for RegisterSpace"); - - if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 4)) - Range.OffsetInDescriptorsFromTableStart = *Val; - else - return reportError(Ctx, - "Invalid value for OffsetInDescriptorsFromTableStart"); - - if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 5)) - Range.Flags = *Val; - else - return reportError(Ctx, "Invalid value for Descriptor Range Flags"); - - Table.Ranges.push_back(Range); - return false; -} - -static bool parseDescriptorTable(LLVMContext *Ctx, - mcdxbc::RootSignatureDesc &RSD, - MDNode *DescriptorTableNode) { - const unsigned int NumOperands = DescriptorTableNode->getNumOperands(); - if (NumOperands < 2) - return reportError(Ctx, "Invalid format for Descriptor Table"); - - dxbc::RTS0::v1::RootParameterHeader Header; - if (std::optional<uint32_t> Val = extractMdIntValue(DescriptorTableNode, 1)) - Header.ShaderVisibility = *Val; - else - return reportError(Ctx, "Invalid value for ShaderVisibility"); - - mcdxbc::DescriptorTable Table; - Header.ParameterType = - llvm::to_underlying(dxbc::RootParameterType::DescriptorTable); - - for (unsigned int I = 2; I < NumOperands; I++) { - MDNode *Element = dyn_cast<MDNode>(DescriptorTableNode->getOperand(I)); - if (Element == nullptr) - return reportError(Ctx, "Missing Root Element Metadata Node."); - - if (parseDescriptorRange(Ctx, Table, Element)) - return true; - } - - RSD.ParametersContainer.addParameter(Header, Table); - return false; -} - -static bool parseStaticSampler(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD, - MDNode *StaticSamplerNode) { - if (StaticSamplerNode->getNumOperands() != 14) - return reportError(Ctx, "Invalid format for Static Sampler"); - - dxbc::RTS0::v1::StaticSampler Sampler; - if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 1)) - Sampler.Filter = *Val; - else - return reportError(Ctx, "Invalid value for Filter"); - - if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 2)) - Sampler.AddressU = *Val; - else - return reportError(Ctx, "Invalid value for AddressU"); - - if 
(std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 3)) - Sampler.AddressV = *Val; - else - return reportError(Ctx, "Invalid value for AddressV"); - - if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 4)) - Sampler.AddressW = *Val; - else - return reportError(Ctx, "Invalid value for AddressW"); - - if (std::optional<float> Val = extractMdFloatValue(StaticSamplerNode, 5)) - Sampler.MipLODBias = *Val; - else - return reportError(Ctx, "Invalid value for MipLODBias"); - - if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 6)) - Sampler.MaxAnisotropy = *Val; - else - return reportError(Ctx, "Invalid value for MaxAnisotropy"); - - if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 7)) - Sampler.ComparisonFunc = *Val; - else - return reportError(Ctx, "Invalid value for ComparisonFunc "); - - if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 8)) - Sampler.BorderColor = *Val; - else - return reportError(Ctx, "Invalid value for ComparisonFunc "); - - if (std::optional<float> Val = extractMdFloatValue(StaticSamplerNode, 9)) - Sampler.MinLOD = *Val; - else - return reportError(Ctx, "Invalid value for MinLOD"); - - if (std::optional<float> Val = extractMdFloatValue(StaticSamplerNode, 10)) - Sampler.MaxLOD = *Val; - else - return reportError(Ctx, "Invalid value for MaxLOD"); - - if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 11)) - Sampler.ShaderRegister = *Val; - else - return reportError(Ctx, "Invalid value for ShaderRegister"); - - if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 12)) - Sampler.RegisterSpace = *Val; - else - return reportError(Ctx, "Invalid value for RegisterSpace"); - - if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 13)) - Sampler.ShaderVisibility = *Val; - else - return reportError(Ctx, "Invalid value for ShaderVisibility"); - - RSD.StaticSamplers.push_back(Sampler); - return false; -} - -static bool parseRootSignatureElement(LLVMContext *Ctx, - mcdxbc::RootSignatureDesc &RSD, - MDNode *Element) { - std::optional<StringRef> ElementText = extractMdStringValue(Element, 0); - if (!ElementText.has_value()) - return reportError(Ctx, "Invalid format for Root Element"); - - RootSignatureElementKind ElementKind = - StringSwitch<RootSignatureElementKind>(*ElementText) - .Case("RootFlags", RootSignatureElementKind::RootFlags) - .Case("RootConstants", RootSignatureElementKind::RootConstants) - .Case("RootCBV", RootSignatureElementKind::CBV) - .Case("RootSRV", RootSignatureElementKind::SRV) - .Case("RootUAV", RootSignatureElementKind::UAV) - .Case("DescriptorTable", RootSignatureElementKind::DescriptorTable) - .Case("StaticSampler", RootSignatureElementKind::StaticSamplers) - .Default(RootSignatureElementKind::Error); - - switch (ElementKind) { - - case RootSignatureElementKind::RootFlags: - return parseRootFlags(Ctx, RSD, Element); - case RootSignatureElementKind::RootConstants: - return parseRootConstants(Ctx, RSD, Element); - case RootSignatureElementKind::CBV: - case RootSignatureElementKind::SRV: - case RootSignatureElementKind::UAV: - return parseRootDescriptors(Ctx, RSD, Element, ElementKind); - case RootSignatureElementKind::DescriptorTable: - return parseDescriptorTable(Ctx, RSD, Element); - case RootSignatureElementKind::StaticSamplers: - return parseStaticSampler(Ctx, RSD, Element); - case RootSignatureElementKind::Error: - return reportError(Ctx, "Invalid Root Signature Element: " + *ElementText); - } - - 
llvm_unreachable("Unhandled RootSignatureElementKind enum."); -} - -static bool parse(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD, - MDNode *Node) { - bool HasError = false; - - // Loop through the Root Elements of the root signature. - for (const auto &Operand : Node->operands()) { - MDNode *Element = dyn_cast<MDNode>(Operand); - if (Element == nullptr) - return reportError(Ctx, "Missing Root Element Metadata Node."); - - HasError = HasError || parseRootSignatureElement(Ctx, RSD, Element); - } - - return HasError; -} - -static bool validate(LLVMContext *Ctx, const mcdxbc::RootSignatureDesc &RSD) { - - if (!llvm::hlsl::rootsig::verifyVersion(RSD.Version)) { - return reportValueError(Ctx, "Version", RSD.Version); - } - - if (!llvm::hlsl::rootsig::verifyRootFlag(RSD.Flags)) { - return reportValueError(Ctx, "RootFlags", RSD.Flags); - } - - for (const mcdxbc::RootParameterInfo &Info : RSD.ParametersContainer) { - if (!dxbc::isValidShaderVisibility(Info.Header.ShaderVisibility)) - return reportValueError(Ctx, "ShaderVisibility", - Info.Header.ShaderVisibility); - - assert(dxbc::isValidParameterType(Info.Header.ParameterType) && - "Invalid value for ParameterType"); - - switch (Info.Header.ParameterType) { - - case llvm::to_underlying(dxbc::RootParameterType::CBV): - case llvm::to_underlying(dxbc::RootParameterType::UAV): - case llvm::to_underlying(dxbc::RootParameterType::SRV): { - const dxbc::RTS0::v2::RootDescriptor &Descriptor = - RSD.ParametersContainer.getRootDescriptor(Info.Location); - if (!llvm::hlsl::rootsig::verifyRegisterValue(Descriptor.ShaderRegister)) - return reportValueError(Ctx, "ShaderRegister", - Descriptor.ShaderRegister); - - if (!llvm::hlsl::rootsig::verifyRegisterSpace(Descriptor.RegisterSpace)) - return reportValueError(Ctx, "RegisterSpace", Descriptor.RegisterSpace); - - if (RSD.Version > 1) { - if (!llvm::hlsl::rootsig::verifyRootDescriptorFlag(RSD.Version, - Descriptor.Flags)) - return reportValueError(Ctx, "RootDescriptorFlag", Descriptor.Flags); - } - break; - } - case llvm::to_underlying(dxbc::RootParameterType::DescriptorTable): { - const mcdxbc::DescriptorTable &Table = - RSD.ParametersContainer.getDescriptorTable(Info.Location); - for (const dxbc::RTS0::v2::DescriptorRange &Range : Table) { - if (!llvm::hlsl::rootsig::verifyRangeType(Range.RangeType)) - return reportValueError(Ctx, "RangeType", Range.RangeType); - - if (!llvm::hlsl::rootsig::verifyRegisterSpace(Range.RegisterSpace)) - return reportValueError(Ctx, "RegisterSpace", Range.RegisterSpace); - - if (!llvm::hlsl::rootsig::verifyNumDescriptors(Range.NumDescriptors)) - return reportValueError(Ctx, "NumDescriptors", Range.NumDescriptors); - - if (!llvm::hlsl::rootsig::verifyDescriptorRangeFlag( - RSD.Version, Range.RangeType, Range.Flags)) - return reportValueError(Ctx, "DescriptorFlag", Range.Flags); - } - break; - } - } - } - - for (const dxbc::RTS0::v1::StaticSampler &Sampler : RSD.StaticSamplers) { - if (!llvm::hlsl::rootsig::verifySamplerFilter(Sampler.Filter)) - return reportValueError(Ctx, "Filter", Sampler.Filter); - - if (!llvm::hlsl::rootsig::verifyAddress(Sampler.AddressU)) - return reportValueError(Ctx, "AddressU", Sampler.AddressU); - - if (!llvm::hlsl::rootsig::verifyAddress(Sampler.AddressV)) - return reportValueError(Ctx, "AddressV", Sampler.AddressV); - - if (!llvm::hlsl::rootsig::verifyAddress(Sampler.AddressW)) - return reportValueError(Ctx, "AddressW", Sampler.AddressW); - - if (!llvm::hlsl::rootsig::verifyMipLODBias(Sampler.MipLODBias)) - return reportValueError(Ctx, "MipLODBias", 
Sampler.MipLODBias); - - if (!llvm::hlsl::rootsig::verifyMaxAnisotropy(Sampler.MaxAnisotropy)) - return reportValueError(Ctx, "MaxAnisotropy", Sampler.MaxAnisotropy); - - if (!llvm::hlsl::rootsig::verifyComparisonFunc(Sampler.ComparisonFunc)) - return reportValueError(Ctx, "ComparisonFunc", Sampler.ComparisonFunc); - - if (!llvm::hlsl::rootsig::verifyBorderColor(Sampler.BorderColor)) - return reportValueError(Ctx, "BorderColor", Sampler.BorderColor); - - if (!llvm::hlsl::rootsig::verifyLOD(Sampler.MinLOD)) - return reportValueError(Ctx, "MinLOD", Sampler.MinLOD); - - if (!llvm::hlsl::rootsig::verifyLOD(Sampler.MaxLOD)) - return reportValueError(Ctx, "MaxLOD", Sampler.MaxLOD); - - if (!llvm::hlsl::rootsig::verifyRegisterValue(Sampler.ShaderRegister)) - return reportValueError(Ctx, "ShaderRegister", Sampler.ShaderRegister); - - if (!llvm::hlsl::rootsig::verifyRegisterSpace(Sampler.RegisterSpace)) - return reportValueError(Ctx, "RegisterSpace", Sampler.RegisterSpace); - - if (!dxbc::isValidShaderVisibility(Sampler.ShaderVisibility)) - return reportValueError(Ctx, "ShaderVisibility", - Sampler.ShaderVisibility); - } - - return false; +static bool reportError(LLVMContext *Ctx, Twine Message, + DiagnosticSeverity Severity = DS_Error) { + Ctx->diagnose(DiagnosticInfoGeneric(Message, Severity)); + return true; } static SmallDenseMap<const Function *, mcdxbc::RootSignatureDesc> @@ -584,7 +127,9 @@ analyzeModule(Module &M) { // static sampler offset is calculated when writting dxcontainer. RSD.StaticSamplersOffset = 0u; - if (parse(Ctx, RSD, RootElementListNode) || validate(Ctx, RSD)) { + hlsl::rootsig::MetadataParser MDParser(RootElementListNode); + + if (MDParser.ParseRootSignature(Ctx, RSD)) { return RSDMap; } diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.h b/llvm/lib/Target/DirectX/DXILRootSignature.h index fc39b38..254b7ff 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.h +++ b/llvm/lib/Target/DirectX/DXILRootSignature.h @@ -26,17 +26,6 @@ namespace llvm { namespace dxil { -enum class RootSignatureElementKind { - Error = 0, - RootFlags = 1, - RootConstants = 2, - SRV = 3, - UAV = 4, - CBV = 5, - DescriptorTable = 6, - StaticSamplers = 7 -}; - class RootSignatureBindingInfo { private: SmallDenseMap<const Function *, mcdxbc::RootSignatureDesc> FuncToRsMap; diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp index 46d5d71..1d79c30 100644 --- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp +++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp @@ -2545,25 +2545,6 @@ void DXILBitcodeWriter::writeInstruction(const Instruction &I, unsigned InstID, Vals.clear(); } -// HLSL Change -namespace { -struct ValueNameCreator { - MallocAllocator Allocator; - SmallVector<ValueName *, 2> - ValueNames; // SmallVector N = 2 because we currently only expect this - // to hold ValueNames for Lifetime intrinsics - ~ValueNameCreator() { - for (auto *VN : ValueNames) - VN->Destroy(Allocator); - } - ValueName *create(StringRef Name, Value *V) { - ValueName *VN = ValueName::create(Name, Allocator, V); - ValueNames.push_back(VN); - return VN; - } -}; -} // anonymous namespace - // Emit names for globals/functions etc. void DXILBitcodeWriter::writeFunctionLevelValueSymbolTable( const ValueSymbolTable &VST) { @@ -2578,24 +2559,9 @@ void DXILBitcodeWriter::writeFunctionLevelValueSymbolTable( // to ensure the binary is the same no matter what values ever existed. 
SmallVector<const ValueName *, 16> SortedTable; - // HLSL Change - ValueNameCreator VNC; for (auto &VI : VST) { - ValueName *VN = VI.second->getValueName(); - // Clang mangles lifetime intrinsic names by appending '.p0' to the end, - // making them invalid lifetime intrinsics in LLVM 3.7. We can't - // demangle in dxil-prepare because it would result in invalid IR. - // Therefore we have to do this in the bitcode writer while writing its - // name to the symbol table. - if (const Function *Fn = dyn_cast<Function>(VI.getValue()); - Fn && Fn->isIntrinsic()) { - Intrinsic::ID IID = Fn->getIntrinsicID(); - if (IID == Intrinsic::lifetime_start || IID == Intrinsic::lifetime_end) - VN = VNC.create(Intrinsic::getBaseName(IID), VI.second); - } - SortedTable.push_back(VN); + SortedTable.push_back(VI.second->getValueName()); } - // The keys are unique, so there shouldn't be stability issues. llvm::sort(SortedTable, [](const ValueName *A, const ValueName *B) { return A->first() < B->first(); diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp index dfc79039c..1bd5dd7 100644 --- a/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp +++ b/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp @@ -17,6 +17,7 @@ #include "llvm/Analysis/ModuleSummaryAnalysis.h" #include "llvm/IR/Constants.h" #include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/InitializePasses.h" @@ -52,6 +53,53 @@ public: } }; +static void legalizeLifetimeIntrinsics(Module &M) { + for (Function &F : M) { + Intrinsic::ID IID = F.getIntrinsicID(); + if (IID != Intrinsic::lifetime_start && IID != Intrinsic::lifetime_end) + continue; + + // Lifetime intrinsics in LLVM 3.7 do not have the memory FnAttr + F.removeFnAttr(Attribute::Memory); + + // Lifetime intrinsics in LLVM 3.7 do not have mangled names + F.setName(Intrinsic::getBaseName(IID)); + + // LLVM 3.7 Lifetime intrinics require an i8* operand, so we insert bitcasts + // to ensure that is the case + for (auto *User : make_early_inc_range(F.users())) { + CallInst *CI = dyn_cast<CallInst>(User); + assert(CI && "Expected user of a lifetime intrinsic function to be a " + "lifetime intrinsic call"); + Value *PtrOperand = CI->getArgOperand(1); + PointerType *PtrTy = cast<PointerType>(PtrOperand->getType()); + Value *NoOpBitCast = CastInst::Create(Instruction::BitCast, PtrOperand, + PtrTy, "", CI->getIterator()); + CI->setArgOperand(1, NoOpBitCast); + } + } +} + +static void removeLifetimeIntrinsics(Module &M) { + for (Function &F : make_early_inc_range(M)) { + if (Intrinsic::ID IID = F.getIntrinsicID(); + IID != Intrinsic::lifetime_start && IID != Intrinsic::lifetime_end) + continue; + + for (User *U : make_early_inc_range(F.users())) { + LifetimeIntrinsic *LI = dyn_cast<LifetimeIntrinsic>(U); + assert(LI && "Expected user of lifetime intrinsic function to be " + "a LifetimeIntrinsic instruction"); + BitCastInst *BCI = dyn_cast<BitCastInst>(LI->getArgOperand(1)); + assert(BCI && "Expected pointer operand of LifetimeIntrinsic to be a " + "BitCastInst"); + LI->eraseFromParent(); + BCI->eraseFromParent(); + } + F.eraseFromParent(); + } +} + class EmbedDXILPass : public llvm::ModulePass { public: static char ID; // Pass identification, replacement for typeid @@ -70,8 +118,17 @@ public: // Only the output bitcode need to be DXIL triple. 
M.setTargetTriple(Triple("dxil-ms-dx")); + // Perform late legalization of lifetime intrinsics that would otherwise + // fail the Module Verifier if performed in an earlier pass + legalizeLifetimeIntrinsics(M); + WriteDXILToFile(M, OS); + // We no longer need lifetime intrinsics after bitcode serialization, so we + // simply remove them to keep the Module Verifier happy after our + // not-so-legal legalizations + removeLifetimeIntrinsics(M); + // Recover triple. M.setTargetTriple(OriginalTriple); diff --git a/llvm/lib/Target/Hexagon/HexagonInstrFormats.td b/llvm/lib/Target/Hexagon/HexagonInstrFormats.td index f0ca908..6050649 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrFormats.td +++ b/llvm/lib/Target/Hexagon/HexagonInstrFormats.td @@ -336,5 +336,4 @@ class InstDuplex<bits<4> iClass, string cstr = ""> : Instruction, // Instruction Classes Definitions - //===----------------------------------------------------------------------===// -include "HexagonInstrFormatsV60.td" include "HexagonInstrFormatsV65.td" diff --git a/llvm/lib/Target/Hexagon/HexagonInstrFormatsV60.td b/llvm/lib/Target/Hexagon/HexagonInstrFormatsV60.td deleted file mode 100644 index 86a8218..0000000 --- a/llvm/lib/Target/Hexagon/HexagonInstrFormatsV60.td +++ /dev/null @@ -1,21 +0,0 @@ -//==- HexagonInstrFormatsV60.td - Hexagon Instruction Formats -*- tablegen -==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file describes the Hexagon V60 instruction classes in TableGen format. -// -//===----------------------------------------------------------------------===// - -//----------------------------------------------------------------------------// -// Instruction Classes Definitions + -//----------------------------------------------------------------------------// - -class CVI_VA_Resource<dag outs, dag ins, string asmstr, - list<dag> pattern = [], string cstr = "", - InstrItinClass itin = CVI_VA> - : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VA>, - OpcodeHexagon, Requires<[HasV60, UseHVX]>; diff --git a/llvm/lib/Target/Hexagon/HexagonInstrFormatsV65.td b/llvm/lib/Target/Hexagon/HexagonInstrFormatsV65.td index 246a1d3..85b826f 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrFormatsV65.td +++ b/llvm/lib/Target/Hexagon/HexagonInstrFormatsV65.td @@ -20,11 +20,6 @@ // Instruction Classes Definitions + //----------------------------------------------------------------------------// -class CVI_VA_Resource_NoOpcode<dag outs, dag ins, string asmstr, - list<dag> pattern = [], string cstr = "", - InstrItinClass itin = CVI_VA> - : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VA>; - class CVI_GATHER_TMP_LD_Resource_NoOpcode<dag outs, dag ins, string asmstr, list<dag> pattern = [], string cstr = "", InstrItinClass itin = CVI_GATHER_PSEUDO> diff --git a/llvm/lib/Target/Hexagon/HexagonIntrinsicsV5.td b/llvm/lib/Target/Hexagon/HexagonIntrinsicsV5.td deleted file mode 100644 index 44f39a3..0000000 --- a/llvm/lib/Target/Hexagon/HexagonIntrinsicsV5.td +++ /dev/null @@ -1,414 +0,0 @@ -//===- HexagonIntrinsicsV5.td - V5 Instruction intrinsics --*- tablegen -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -def : T_PR_pat <M2_vrcmpys_s1, int_hexagon_M2_vrcmpys_s1>; -def : T_PPR_pat<M2_vrcmpys_acc_s1, int_hexagon_M2_vrcmpys_acc_s1>; -def : T_PR_pat <M2_vrcmpys_s1rp, int_hexagon_M2_vrcmpys_s1rp>; - -// Vector reduce add unsigned halfwords -def : T_PP_pat<M2_vradduh, int_hexagon_M2_vradduh>; - -def: T_RP_pat<A2_addsp, int_hexagon_A2_addsp>; -def: T_PP_pat<A2_addpsat, int_hexagon_A2_addpsat>; -def: T_PP_pat<A2_minp, int_hexagon_A2_minp>; -def: T_PP_pat<A2_minup, int_hexagon_A2_minup>; -def: T_PP_pat<A2_maxp, int_hexagon_A2_maxp>; -def: T_PP_pat<A2_maxup, int_hexagon_A2_maxup>; - -// Vector reduce multiply word by signed half (32x16) -//Rdd=vrmpyweh(Rss,Rtt)[:<<1] -def : T_PP_pat <M4_vrmpyeh_s0, int_hexagon_M4_vrmpyeh_s0>; -def : T_PP_pat <M4_vrmpyeh_s1, int_hexagon_M4_vrmpyeh_s1>; - -//Rdd=vrmpywoh(Rss,Rtt)[:<<1] -def : T_PP_pat <M4_vrmpyoh_s0, int_hexagon_M4_vrmpyoh_s0>; -def : T_PP_pat <M4_vrmpyoh_s1, int_hexagon_M4_vrmpyoh_s1>; - -//Rdd+=vrmpyweh(Rss,Rtt)[:<<1] -def : T_PPP_pat <M4_vrmpyeh_acc_s0, int_hexagon_M4_vrmpyeh_acc_s0>; -def : T_PPP_pat <M4_vrmpyeh_acc_s1, int_hexagon_M4_vrmpyeh_acc_s1>; - -//Rdd=vrmpywoh(Rss,Rtt)[:<<1] -def : T_PPP_pat <M4_vrmpyoh_acc_s0, int_hexagon_M4_vrmpyoh_acc_s0>; -def : T_PPP_pat <M4_vrmpyoh_acc_s1, int_hexagon_M4_vrmpyoh_acc_s1>; - -// Vector multiply halfwords, signed by unsigned -// Rdd=vmpyhsu(Rs,Rt)[:<<1]:sat -def : T_RR_pat <M2_vmpy2su_s0, int_hexagon_M2_vmpy2su_s0>; -def : T_RR_pat <M2_vmpy2su_s1, int_hexagon_M2_vmpy2su_s1>; - -// Rxx+=vmpyhsu(Rs,Rt)[:<<1]:sat -def : T_PRR_pat <M2_vmac2su_s0, int_hexagon_M2_vmac2su_s0>; -def : T_PRR_pat <M2_vmac2su_s1, int_hexagon_M2_vmac2su_s1>; - -// Vector polynomial multiply halfwords -// Rdd=vpmpyh(Rs,Rt) -def : T_RR_pat <M4_vpmpyh, int_hexagon_M4_vpmpyh>; -// Rxx[^]=vpmpyh(Rs,Rt) -def : T_PRR_pat <M4_vpmpyh_acc, int_hexagon_M4_vpmpyh_acc>; - -// Polynomial multiply words -// Rdd=pmpyw(Rs,Rt) -def : T_RR_pat <M4_pmpyw, int_hexagon_M4_pmpyw>; -// Rxx^=pmpyw(Rs,Rt) -def : T_PRR_pat <M4_pmpyw_acc, int_hexagon_M4_pmpyw_acc>; - -//Rxx^=asr(Rss,Rt) -def : T_PPR_pat <S2_asr_r_p_xor, int_hexagon_S2_asr_r_p_xor>; -//Rxx^=asl(Rss,Rt) -def : T_PPR_pat <S2_asl_r_p_xor, int_hexagon_S2_asl_r_p_xor>; -//Rxx^=lsr(Rss,Rt) -def : T_PPR_pat <S2_lsr_r_p_xor, int_hexagon_S2_lsr_r_p_xor>; -//Rxx^=lsl(Rss,Rt) -def : T_PPR_pat <S2_lsl_r_p_xor, int_hexagon_S2_lsl_r_p_xor>; - -// Multiply and use upper result -def : T_RR_pat <M2_mpysu_up, int_hexagon_M2_mpysu_up>; -def : T_RR_pat <M2_mpy_up_s1, int_hexagon_M2_mpy_up_s1>; -def : T_RR_pat <M2_hmmpyh_s1, int_hexagon_M2_hmmpyh_s1>; -def : T_RR_pat <M2_hmmpyl_s1, int_hexagon_M2_hmmpyl_s1>; -def : T_RR_pat <M2_mpy_up_s1_sat, int_hexagon_M2_mpy_up_s1_sat>; - -def : T_PP_pat <A2_vaddub, int_hexagon_A2_vaddb_map>; -def : T_PP_pat <A2_vsubub, int_hexagon_A2_vsubb_map>; - -// Vector reduce add unsigned halfwords -def : T_PP_pat <M2_vraddh, int_hexagon_M2_vraddh>; - -def: T_P_pat<S2_brevp, int_hexagon_S2_brevp>; -def: T_P_pat<S2_ct0p, int_hexagon_S2_ct0p>; -def: T_P_pat<S2_ct1p, int_hexagon_S2_ct1p>; - -def: T_Q_RR_pat<C4_nbitsset, int_hexagon_C4_nbitsset>; -def: T_Q_RR_pat<C4_nbitsclr, int_hexagon_C4_nbitsclr>; -def: T_Q_RI_pat<C4_nbitsclri, int_hexagon_C4_nbitsclri>; - -def : T_Q_PI_pat<A4_vcmpbeqi, int_hexagon_A4_vcmpbeqi>; -def : T_Q_PI_pat<A4_vcmpbgti, int_hexagon_A4_vcmpbgti>; -def : T_Q_PI_pat<A4_vcmpbgtui, int_hexagon_A4_vcmpbgtui>; -def : 
T_Q_PI_pat<A4_vcmpheqi, int_hexagon_A4_vcmpheqi>; -def : T_Q_PI_pat<A4_vcmphgti, int_hexagon_A4_vcmphgti>; -def : T_Q_PI_pat<A4_vcmphgtui, int_hexagon_A4_vcmphgtui>; -def : T_Q_PI_pat<A4_vcmpweqi, int_hexagon_A4_vcmpweqi>; -def : T_Q_PI_pat<A4_vcmpwgti, int_hexagon_A4_vcmpwgti>; -def : T_Q_PI_pat<A4_vcmpwgtui, int_hexagon_A4_vcmpwgtui>; -def : T_Q_PP_pat<A4_vcmpbeq_any, int_hexagon_A4_vcmpbeq_any>; - -def : T_Q_RR_pat<A4_cmpbeq, int_hexagon_A4_cmpbeq>; -def : T_Q_RR_pat<A4_cmpbgt, int_hexagon_A4_cmpbgt>; -def : T_Q_RR_pat<A4_cmpbgtu, int_hexagon_A4_cmpbgtu>; -def : T_Q_RR_pat<A4_cmpheq, int_hexagon_A4_cmpheq>; -def : T_Q_RR_pat<A4_cmphgt, int_hexagon_A4_cmphgt>; -def : T_Q_RR_pat<A4_cmphgtu, int_hexagon_A4_cmphgtu>; - -def : T_Q_RI_pat<A4_cmpbeqi, int_hexagon_A4_cmpbeqi>; -def : T_Q_RI_pat<A4_cmpbgti, int_hexagon_A4_cmpbgti>; -def : T_Q_RI_pat<A4_cmpbgtui, int_hexagon_A4_cmpbgtui>; - -def : T_Q_RI_pat<A4_cmpheqi, int_hexagon_A4_cmpheqi>; -def : T_Q_RI_pat<A4_cmphgti, int_hexagon_A4_cmphgti>; -def : T_Q_RI_pat<A4_cmphgtui, int_hexagon_A4_cmphgtui>; - -def : T_Q_RP_pat<A4_boundscheck, int_hexagon_A4_boundscheck>; -def : T_Q_PR_pat<A4_tlbmatch, int_hexagon_A4_tlbmatch>; - -def : T_RRR_pat <M4_mpyrr_addr, int_hexagon_M4_mpyrr_addr>; -def : T_IRR_pat <M4_mpyrr_addi, int_hexagon_M4_mpyrr_addi>; -def : T_IRI_pat <M4_mpyri_addi, int_hexagon_M4_mpyri_addi>; -def : T_RIR_pat <M4_mpyri_addr_u2, int_hexagon_M4_mpyri_addr_u2>; -def : T_RRI_pat <M4_mpyri_addr, int_hexagon_M4_mpyri_addr>; -def : T_RRR_pat <M4_mac_up_s1_sat, int_hexagon_M4_mac_up_s1_sat>; -def : T_RRR_pat <M4_nac_up_s1_sat, int_hexagon_M4_nac_up_s1_sat>; - -// Complex multiply 32x16 -def : T_PR_pat <M4_cmpyi_wh, int_hexagon_M4_cmpyi_wh>; -def : T_PR_pat <M4_cmpyr_wh, int_hexagon_M4_cmpyr_wh>; - -def : T_PR_pat <M4_cmpyi_whc, int_hexagon_M4_cmpyi_whc>; -def : T_PR_pat <M4_cmpyr_whc, int_hexagon_M4_cmpyr_whc>; - -def : T_PP_pat<A4_andnp, int_hexagon_A4_andnp>; -def : T_PP_pat<A4_ornp, int_hexagon_A4_ornp>; - -// Complex add/sub halfwords/words -def : T_PP_pat <S4_vxaddsubw, int_hexagon_S4_vxaddsubw>; -def : T_PP_pat <S4_vxsubaddw, int_hexagon_S4_vxsubaddw>; -def : T_PP_pat <S4_vxaddsubh, int_hexagon_S4_vxaddsubh>; -def : T_PP_pat <S4_vxsubaddh, int_hexagon_S4_vxsubaddh>; - -def : T_PP_pat <S4_vxaddsubhr, int_hexagon_S4_vxaddsubhr>; -def : T_PP_pat <S4_vxsubaddhr, int_hexagon_S4_vxsubaddhr>; - -// Extract bitfield -def : T_PP_pat <S4_extractp_rp, int_hexagon_S4_extractp_rp>; -def : T_RP_pat <S4_extract_rp, int_hexagon_S4_extract_rp>; -def : T_PII_pat <S4_extractp, int_hexagon_S4_extractp>; -def : T_RII_pat <S4_extract, int_hexagon_S4_extract>; - -// Vector conditional negate -// Rdd=vcnegh(Rss,Rt) -def : T_PR_pat <S2_vcnegh, int_hexagon_S2_vcnegh>; - -// Shift an immediate left by register amount -def : T_IR_pat<S4_lsli, int_hexagon_S4_lsli>; - -// Vector reduce maximum halfwords -def : T_PPR_pat <A4_vrmaxh, int_hexagon_A4_vrmaxh>; -def : T_PPR_pat <A4_vrmaxuh, int_hexagon_A4_vrmaxuh>; - -// Vector reduce maximum words -def : T_PPR_pat <A4_vrmaxw, int_hexagon_A4_vrmaxw>; -def : T_PPR_pat <A4_vrmaxuw, int_hexagon_A4_vrmaxuw>; - -// Vector reduce minimum halfwords -def : T_PPR_pat <A4_vrminh, int_hexagon_A4_vrminh>; -def : T_PPR_pat <A4_vrminuh, int_hexagon_A4_vrminuh>; - -// Vector reduce minimum words -def : T_PPR_pat <A4_vrminw, int_hexagon_A4_vrminw>; -def : T_PPR_pat <A4_vrminuw, int_hexagon_A4_vrminuw>; - -// Rotate and reduce bytes -def : Pat <(int_hexagon_S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2, - u2_0ImmPred:$src3), - 
(S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2, u2_0ImmPred:$src3)>; - -// Rotate and reduce bytes with accumulation -// Rxx+=vrcrotate(Rss,Rt,#u2) -def : Pat <(int_hexagon_S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2, - IntRegs:$src3, u2_0ImmPred:$src4), - (S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2, - IntRegs:$src3, u2_0ImmPred:$src4)>; - -// Vector conditional negate -def : T_PPR_pat<S2_vrcnegh, int_hexagon_S2_vrcnegh>; - -// Logical xor with xor accumulation -def : T_PPP_pat<M4_xor_xacc, int_hexagon_M4_xor_xacc>; - -// ALU64 - Vector min/max byte -def : T_PP_pat <A2_vminb, int_hexagon_A2_vminb>; -def : T_PP_pat <A2_vmaxb, int_hexagon_A2_vmaxb>; - -// Shift and add/sub/and/or -def : T_IRI_pat <S4_andi_asl_ri, int_hexagon_S4_andi_asl_ri>; -def : T_IRI_pat <S4_ori_asl_ri, int_hexagon_S4_ori_asl_ri>; -def : T_IRI_pat <S4_addi_asl_ri, int_hexagon_S4_addi_asl_ri>; -def : T_IRI_pat <S4_subi_asl_ri, int_hexagon_S4_subi_asl_ri>; -def : T_IRI_pat <S4_andi_lsr_ri, int_hexagon_S4_andi_lsr_ri>; -def : T_IRI_pat <S4_ori_lsr_ri, int_hexagon_S4_ori_lsr_ri>; -def : T_IRI_pat <S4_addi_lsr_ri, int_hexagon_S4_addi_lsr_ri>; -def : T_IRI_pat <S4_subi_lsr_ri, int_hexagon_S4_subi_lsr_ri>; - -// Split bitfield -def : T_RI_pat <A4_bitspliti, int_hexagon_A4_bitspliti>; -def : T_RR_pat <A4_bitsplit, int_hexagon_A4_bitsplit>; - -def: T_RR_pat<S4_parity, int_hexagon_S4_parity>; - -def: T_Q_RI_pat<S4_ntstbit_i, int_hexagon_S4_ntstbit_i>; -def: T_Q_RR_pat<S4_ntstbit_r, int_hexagon_S4_ntstbit_r>; - -def: T_RI_pat<S4_clbaddi, int_hexagon_S4_clbaddi>; -def: T_PI_pat<S4_clbpaddi, int_hexagon_S4_clbpaddi>; -def: T_P_pat <S4_clbpnorm, int_hexagon_S4_clbpnorm>; - -//******************************************************************* -// ALU32/ALU -//******************************************************************* - -// ALU32 / ALU / Logical Operations. -def: T_RR_pat<A4_andn, int_hexagon_A4_andn>; -def: T_RR_pat<A4_orn, int_hexagon_A4_orn>; - -//******************************************************************* -// ALU32/PERM -//******************************************************************* - -// Combine Words Into Doublewords. -def: T_RI_pat<A4_combineri, int_hexagon_A4_combineri, s32_0ImmPred>; -def: T_IR_pat<A4_combineir, int_hexagon_A4_combineir, s32_0ImmPred>; - -//******************************************************************* -// ALU32/PRED -//******************************************************************* - -// Compare -def : T_Q_RI_pat<C4_cmpneqi, int_hexagon_C4_cmpneqi, s32_0ImmPred>; -def : T_Q_RI_pat<C4_cmpltei, int_hexagon_C4_cmpltei, s32_0ImmPred>; -def : T_Q_RI_pat<C4_cmplteui, int_hexagon_C4_cmplteui, u32_0ImmPred>; - -// Compare To General Register. -def: T_Q_RR_pat<C4_cmpneq, int_hexagon_C4_cmpneq>; -def: T_Q_RR_pat<C4_cmplte, int_hexagon_C4_cmplte>; -def: T_Q_RR_pat<C4_cmplteu, int_hexagon_C4_cmplteu>; - -def: T_RR_pat<A4_rcmpeq, int_hexagon_A4_rcmpeq>; -def: T_RR_pat<A4_rcmpneq, int_hexagon_A4_rcmpneq>; - -def: T_RI_pat<A4_rcmpeqi, int_hexagon_A4_rcmpeqi>; -def: T_RI_pat<A4_rcmpneqi, int_hexagon_A4_rcmpneqi>; - -//******************************************************************* -// CR -//******************************************************************* - -// CR / Logical Operations On Predicates. 
-def: T_Q_QQQ_pat<C4_and_and, int_hexagon_C4_and_and>; -def: T_Q_QQQ_pat<C4_and_andn, int_hexagon_C4_and_andn>; -def: T_Q_QQQ_pat<C4_and_or, int_hexagon_C4_and_or>; -def: T_Q_QQQ_pat<C4_and_orn, int_hexagon_C4_and_orn>; -def: T_Q_QQQ_pat<C4_or_and, int_hexagon_C4_or_and>; -def: T_Q_QQQ_pat<C4_or_andn, int_hexagon_C4_or_andn>; -def: T_Q_QQQ_pat<C4_or_or, int_hexagon_C4_or_or>; -def: T_Q_QQQ_pat<C4_or_orn, int_hexagon_C4_or_orn>; - -//******************************************************************* -// XTYPE/ALU -//******************************************************************* - -// Add And Accumulate. - -def : T_RRI_pat <S4_addaddi, int_hexagon_S4_addaddi>; -def : T_RIR_pat <S4_subaddi, int_hexagon_S4_subaddi>; - - -// XTYPE / ALU / Logical-logical Words. -def : T_RRR_pat <M4_or_xor, int_hexagon_M4_or_xor>; -def : T_RRR_pat <M4_and_xor, int_hexagon_M4_and_xor>; -def : T_RRR_pat <M4_or_and, int_hexagon_M4_or_and>; -def : T_RRR_pat <M4_and_and, int_hexagon_M4_and_and>; -def : T_RRR_pat <M4_xor_and, int_hexagon_M4_xor_and>; -def : T_RRR_pat <M4_or_or, int_hexagon_M4_or_or>; -def : T_RRR_pat <M4_and_or, int_hexagon_M4_and_or>; -def : T_RRR_pat <M4_xor_or, int_hexagon_M4_xor_or>; -def : T_RRR_pat <M4_or_andn, int_hexagon_M4_or_andn>; -def : T_RRR_pat <M4_and_andn, int_hexagon_M4_and_andn>; -def : T_RRR_pat <M4_xor_andn, int_hexagon_M4_xor_andn>; - -def : T_RRI_pat <S4_or_andi, int_hexagon_S4_or_andi>; -def : T_RRI_pat <S4_or_andix, int_hexagon_S4_or_andix>; -def : T_RRI_pat <S4_or_ori, int_hexagon_S4_or_ori>; - -// Modulo wrap. -def : T_RR_pat <A4_modwrapu, int_hexagon_A4_modwrapu>; - -// Arithmetic/Convergent round -// Rd=[cround|round](Rs,Rt)[:sat] -// Rd=[cround|round](Rs,#u5)[:sat] -def : T_RI_pat <A4_cround_ri, int_hexagon_A4_cround_ri>; -def : T_RR_pat <A4_cround_rr, int_hexagon_A4_cround_rr>; - -def : T_RI_pat <A4_round_ri, int_hexagon_A4_round_ri>; -def : T_RR_pat <A4_round_rr, int_hexagon_A4_round_rr>; - -def : T_RI_pat <A4_round_ri_sat, int_hexagon_A4_round_ri_sat>; -def : T_RR_pat <A4_round_rr_sat, int_hexagon_A4_round_rr_sat>; - -def : T_P_pat <A2_roundsat, int_hexagon_A2_roundsat>; - -//Rdd[+]=vrmpybsu(Rss,Rtt) -//Rdd[+]=vrmpybuu(Rss,Rtt) -def : T_PP_pat <M5_vrmpybsu, int_hexagon_M5_vrmpybsu>; -def : T_PP_pat <M5_vrmpybuu, int_hexagon_M5_vrmpybuu>; - -def : T_PP_pat <M5_vdmpybsu, int_hexagon_M5_vdmpybsu>; - -def : T_PPP_pat <M5_vrmacbsu, int_hexagon_M5_vrmacbsu>; -def : T_PPP_pat <M5_vrmacbuu, int_hexagon_M5_vrmacbuu>; -//Rxx+=vdmpybsu(Rss,Rtt):sat -def : T_PPP_pat <M5_vdmacbsu, int_hexagon_M5_vdmacbsu>; - -// Vector multiply bytes -// Rdd=vmpyb[s]u(Rs,Rt) -def : T_RR_pat <M5_vmpybsu, int_hexagon_M5_vmpybsu>; -def : T_RR_pat <M5_vmpybuu, int_hexagon_M5_vmpybuu>; - -// Rxx+=vmpyb[s]u(Rs,Rt) -def : T_PRR_pat <M5_vmacbsu, int_hexagon_M5_vmacbsu>; -def : T_PRR_pat <M5_vmacbuu, int_hexagon_M5_vmacbuu>; - -// Rd=vaddhub(Rss,Rtt):sat -def : T_PP_pat <A5_vaddhubs, int_hexagon_A5_vaddhubs>; - -def : T_FF_pat<F2_sfadd, int_hexagon_F2_sfadd>; -def : T_FF_pat<F2_sfsub, int_hexagon_F2_sfsub>; -def : T_FF_pat<F2_sfmpy, int_hexagon_F2_sfmpy>; -def : T_FF_pat<F2_sfmax, int_hexagon_F2_sfmax>; -def : T_FF_pat<F2_sfmin, int_hexagon_F2_sfmin>; - -def : T_FF_pat<F2_sffixupn, int_hexagon_F2_sffixupn>; -def : T_FF_pat<F2_sffixupd, int_hexagon_F2_sffixupd>; -def : T_F_pat <F2_sffixupr, int_hexagon_F2_sffixupr>; - -def : T_Q_QQ_pat<C4_fastcorner9, int_hexagon_C4_fastcorner9>; -def : T_Q_QQ_pat<C4_fastcorner9_not, int_hexagon_C4_fastcorner9_not>; - -def : T_P_pat <S5_popcountp, 
int_hexagon_S5_popcountp>; -def : T_PI_pat <S5_asrhub_sat, int_hexagon_S5_asrhub_sat>; - -def : T_PI_pat <S2_asr_i_p_rnd, int_hexagon_S2_asr_i_p_rnd>; -def : T_PI_pat <S2_asr_i_p_rnd_goodsyntax, - int_hexagon_S2_asr_i_p_rnd_goodsyntax>; - -def : T_PI_pat <S5_asrhub_rnd_sat_goodsyntax, - int_hexagon_S5_asrhub_rnd_sat_goodsyntax>; - -def : T_PI_pat <S5_vasrhrnd_goodsyntax, int_hexagon_S5_vasrhrnd_goodsyntax>; - -def : T_FFF_pat <F2_sffma, int_hexagon_F2_sffma>; -def : T_FFF_pat <F2_sffms, int_hexagon_F2_sffms>; -def : T_FFF_pat <F2_sffma_lib, int_hexagon_F2_sffma_lib>; -def : T_FFF_pat <F2_sffms_lib, int_hexagon_F2_sffms_lib>; -def : T_FFFQ_pat <F2_sffma_sc, int_hexagon_F2_sffma_sc>; - -// Compare floating-point value -def : T_Q_FF_pat <F2_sfcmpge, int_hexagon_F2_sfcmpge>; -def : T_Q_FF_pat <F2_sfcmpuo, int_hexagon_F2_sfcmpuo>; -def : T_Q_FF_pat <F2_sfcmpeq, int_hexagon_F2_sfcmpeq>; -def : T_Q_FF_pat <F2_sfcmpgt, int_hexagon_F2_sfcmpgt>; - -def : T_Q_DD_pat <F2_dfcmpeq, int_hexagon_F2_dfcmpeq>; -def : T_Q_DD_pat <F2_dfcmpgt, int_hexagon_F2_dfcmpgt>; -def : T_Q_DD_pat <F2_dfcmpge, int_hexagon_F2_dfcmpge>; -def : T_Q_DD_pat <F2_dfcmpuo, int_hexagon_F2_dfcmpuo>; - -// Create floating-point value -def : T_I_pat <F2_sfimm_p, int_hexagon_F2_sfimm_p>; -def : T_I_pat <F2_sfimm_n, int_hexagon_F2_sfimm_n>; -def : T_I_pat <F2_dfimm_p, int_hexagon_F2_dfimm_p>; -def : T_I_pat <F2_dfimm_n, int_hexagon_F2_dfimm_n>; - -def : T_Q_DI_pat <F2_dfclass, int_hexagon_F2_dfclass>; -def : T_Q_FI_pat <F2_sfclass, int_hexagon_F2_sfclass>; -def : T_F_pat <F2_conv_sf2df, int_hexagon_F2_conv_sf2df>; -def : T_D_pat <F2_conv_df2sf, int_hexagon_F2_conv_df2sf>; -def : T_R_pat <F2_conv_uw2sf, int_hexagon_F2_conv_uw2sf>; -def : T_R_pat <F2_conv_uw2df, int_hexagon_F2_conv_uw2df>; -def : T_R_pat <F2_conv_w2sf, int_hexagon_F2_conv_w2sf>; -def : T_R_pat <F2_conv_w2df, int_hexagon_F2_conv_w2df>; -def : T_P_pat <F2_conv_ud2sf, int_hexagon_F2_conv_ud2sf>; -def : T_P_pat <F2_conv_ud2df, int_hexagon_F2_conv_ud2df>; -def : T_P_pat <F2_conv_d2sf, int_hexagon_F2_conv_d2sf>; -def : T_P_pat <F2_conv_d2df, int_hexagon_F2_conv_d2df>; -def : T_F_pat <F2_conv_sf2uw, int_hexagon_F2_conv_sf2uw>; -def : T_F_pat <F2_conv_sf2w, int_hexagon_F2_conv_sf2w>; -def : T_F_pat <F2_conv_sf2ud, int_hexagon_F2_conv_sf2ud>; -def : T_F_pat <F2_conv_sf2d, int_hexagon_F2_conv_sf2d>; -def : T_D_pat <F2_conv_df2uw, int_hexagon_F2_conv_df2uw>; -def : T_D_pat <F2_conv_df2w, int_hexagon_F2_conv_df2w>; -def : T_D_pat <F2_conv_df2ud, int_hexagon_F2_conv_df2ud>; -def : T_D_pat <F2_conv_df2d, int_hexagon_F2_conv_df2d>; -def : T_F_pat <F2_conv_sf2uw_chop, int_hexagon_F2_conv_sf2uw_chop>; -def : T_F_pat <F2_conv_sf2w_chop, int_hexagon_F2_conv_sf2w_chop>; -def : T_F_pat <F2_conv_sf2ud_chop, int_hexagon_F2_conv_sf2ud_chop>; -def : T_F_pat <F2_conv_sf2d_chop, int_hexagon_F2_conv_sf2d_chop>; -def : T_D_pat <F2_conv_df2uw_chop, int_hexagon_F2_conv_df2uw_chop>; -def : T_D_pat <F2_conv_df2w_chop, int_hexagon_F2_conv_df2w_chop>; -def : T_D_pat <F2_conv_df2ud_chop, int_hexagon_F2_conv_df2ud_chop>; -def : T_D_pat <F2_conv_df2d_chop, int_hexagon_F2_conv_df2d_chop>; diff --git a/llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td b/llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td deleted file mode 100644 index 796979e..0000000 --- a/llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td +++ /dev/null @@ -1,642 +0,0 @@ -//===- HexagonIntrinsicsV60.td - V60 instruction intrinsics -*- tablegen *-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
-// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file describes the Hexagon V60 Compiler Intrinsics in TableGen format. -// -//===----------------------------------------------------------------------===// - - -let AddedComplexity = 100 in { -def : Pat < (v16i32 (int_hexagon_V6_lo (v32i32 HvxWR:$src1))), - (v16i32 (EXTRACT_SUBREG (v32i32 HvxWR:$src1), vsub_lo)) >; - -def : Pat < (v16i32 (int_hexagon_V6_hi (v32i32 HvxWR:$src1))), - (v16i32 (EXTRACT_SUBREG (v32i32 HvxWR:$src1), vsub_hi)) >; - -def : Pat < (v32i32 (int_hexagon_V6_lo_128B (v64i32 HvxWR:$src1))), - (v32i32 (EXTRACT_SUBREG (v64i32 HvxWR:$src1), vsub_lo)) >; - -def : Pat < (v32i32 (int_hexagon_V6_hi_128B (v64i32 HvxWR:$src1))), - (v32i32 (EXTRACT_SUBREG (v64i32 HvxWR:$src1), vsub_hi)) >; -} - -def : Pat <(v64i1 (bitconvert (v16i32 HvxVR:$src1))), - (v64i1 (V6_vandvrt(v16i32 HvxVR:$src1), (A2_tfrsi 0x01010101)))>; - -def : Pat <(v64i1 (bitconvert (v32i16 HvxVR:$src1))), - (v64i1 (V6_vandvrt(v32i16 HvxVR:$src1), (A2_tfrsi 0x01010101)))>; - -def : Pat <(v64i1 (bitconvert (v64i8 HvxVR:$src1))), - (v64i1 (V6_vandvrt(v64i8 HvxVR:$src1), (A2_tfrsi 0x01010101)))>; - -def : Pat <(v16i32 (bitconvert (v64i1 HvxQR:$src1))), - (v16i32 (V6_vandqrt(v64i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>; - -def : Pat <(v32i16 (bitconvert (v64i1 HvxQR:$src1))), - (v32i16 (V6_vandqrt(v64i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>; - -def : Pat <(v64i8 (bitconvert (v64i1 HvxQR:$src1))), - (v64i8 (V6_vandqrt(v64i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>; - -def : Pat <(v128i1 (bitconvert (v32i32 HvxVR:$src1))), - (v128i1 (V6_vandvrt (v32i32 HvxVR:$src1), (A2_tfrsi 0x01010101)))>; - -def : Pat <(v128i1 (bitconvert (v64i16 HvxVR:$src1))), - (v128i1 (V6_vandvrt (v64i16 HvxVR:$src1), (A2_tfrsi 0x01010101)))>; - -def : Pat <(v128i1 (bitconvert (v128i8 HvxVR:$src1))), - (v128i1 (V6_vandvrt (v128i8 HvxVR:$src1), (A2_tfrsi 0x01010101)))>; - -def : Pat <(v32i32 (bitconvert (v128i1 HvxQR:$src1))), - (v32i32 (V6_vandqrt (v128i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>; - -def : Pat <(v64i16 (bitconvert (v128i1 HvxQR:$src1))), - (v64i16 (V6_vandqrt (v128i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>; - -def : Pat <(v128i8 (bitconvert (v128i1 HvxQR:$src1))), - (v128i8 (V6_vandqrt (v128i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>; - -let AddedComplexity = 140 in { -def : Pat <(store (v64i1 HvxQR:$src1), (i32 IntRegs:$addr)), - (V6_vS32b_ai IntRegs:$addr, 0, - (v16i32 (V6_vandqrt (v64i1 HvxQR:$src1), - (A2_tfrsi 0x01010101))))>; - -def : Pat <(v64i1 (load (i32 IntRegs:$addr))), - (v64i1 (V6_vandvrt - (v16i32 (V6_vL32b_ai IntRegs:$addr, 0)), (A2_tfrsi 0x01010101)))>; - -def : Pat <(store (v128i1 HvxQR:$src1), (i32 IntRegs:$addr)), - (V6_vS32b_ai IntRegs:$addr, 0, - (v32i32 (V6_vandqrt (v128i1 HvxQR:$src1), - (A2_tfrsi 0x01010101))))>; - -def : Pat <(v128i1 (load (i32 IntRegs:$addr))), - (v128i1 (V6_vandvrt - (v32i32 (V6_vL32b_ai IntRegs:$addr, 0)), (A2_tfrsi 0x01010101)))>; -} - -multiclass T_R_pat <InstHexagon MI, Intrinsic IntID> { - def: Pat<(IntID IntRegs:$src1), (MI IntRegs:$src1)>; - def: Pat<(!cast<Intrinsic>(IntID#"_128B") IntRegs:$src1), - (MI IntRegs:$src1)>; -} - -multiclass T_V_pat <InstHexagon MI, Intrinsic IntID> { - def: Pat<(IntID HvxVR:$src1), - (MI HvxVR:$src1)>; - - def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1), - (MI HvxVR:$src1)>; -} - -multiclass T_W_pat <InstHexagon MI, Intrinsic IntID> 
{ - def: Pat<(IntID HvxWR:$src1), - (MI HvxWR:$src1)>; - - def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1), - (MI HvxWR:$src1)>; -} - -multiclass T_Q_pat <InstHexagon MI, Intrinsic IntID> { - def: Pat<(IntID HvxQR:$src1), - (MI HvxQR:$src1)>; - - def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxQR:$src1), - (MI HvxQR:$src1)>; -} - -multiclass T_WR_pat <InstHexagon MI, Intrinsic IntID> { - def: Pat<(IntID HvxWR:$src1, IntRegs:$src2), - (MI HvxWR:$src1, IntRegs:$src2)>; - - def: Pat<(!cast<Intrinsic>(IntID#"_128B")HvxWR:$src1, IntRegs:$src2), - (MI HvxWR:$src1, IntRegs:$src2)>; -} - -multiclass T_VR_pat <InstHexagon MI, Intrinsic IntID> { - def: Pat<(IntID HvxVR:$src1, IntRegs:$src2), - (MI HvxVR:$src1, IntRegs:$src2)>; - - def: Pat<(!cast<Intrinsic>(IntID#"_128B")HvxVR:$src1, IntRegs:$src2), - (MI HvxVR:$src1, IntRegs:$src2)>; -} - -multiclass T_WV_pat <InstHexagon MI, Intrinsic IntID> { - def: Pat<(IntID HvxWR:$src1, HvxVR:$src2), - (MI HvxWR:$src1, HvxVR:$src2)>; - - def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1, HvxVR:$src2), - (MI HvxWR:$src1, HvxVR:$src2)>; -} - -multiclass T_WW_pat <InstHexagon MI, Intrinsic IntID> { - def: Pat<(IntID HvxWR:$src1, HvxWR:$src2), - (MI HvxWR:$src1, HvxWR:$src2)>; - - def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1, HvxWR:$src2), - (MI HvxWR:$src1, HvxWR:$src2)>; -} - -multiclass T_VV_pat <InstHexagon MI, Intrinsic IntID> { - def: Pat<(IntID HvxVR:$src1, HvxVR:$src2), - (MI HvxVR:$src1, HvxVR:$src2)>; - - def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, HvxVR:$src2), - (MI HvxVR:$src1, HvxVR:$src2)>; -} - -multiclass T_QR_pat <InstHexagon MI, Intrinsic IntID> { - def: Pat<(IntID HvxQR:$src1, IntRegs:$src2), - (MI HvxQR:$src1, IntRegs:$src2)>; - - def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxQR:$src1, IntRegs:$src2), - (MI HvxQR:$src1, IntRegs:$src2)>; -} - -multiclass T_QQ_pat <InstHexagon MI, Intrinsic IntID> { - def: Pat<(IntID HvxQR:$src1, HvxQR:$src2), - (MI HvxQR:$src1, HvxQR:$src2)>; - - def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxQR:$src1, HvxQR:$src2), - (MI HvxQR:$src1, HvxQR:$src2)>; -} - -multiclass T_WWR_pat <InstHexagon MI, Intrinsic IntID> { - def: Pat<(IntID HvxWR:$src1, HvxWR:$src2, IntRegs:$src3), - (MI HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>; - - def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1, HvxWR:$src2, - IntRegs:$src3), - (MI HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>; -} - -multiclass T_VVR_pat <InstHexagon MI, Intrinsic IntID> { - def: Pat<(IntID HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), - (MI HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>; - - def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, HvxVR:$src2, - IntRegs:$src3), - (MI HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>; -} - -multiclass T_WVR_pat <InstHexagon MI, Intrinsic IntID> { - def: Pat<(IntID HvxWR:$src1, HvxVR:$src2, IntRegs:$src3), - (MI HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>; - - def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1, HvxVR:$src2, - IntRegs:$src3), - (MI HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>; -} - -multiclass T_VWR_pat <InstHexagon MI, Intrinsic IntID> { - def: Pat<(IntID HvxVR:$src1, HvxWR:$src2, IntRegs:$src3), - (MI HvxVR:$src1, HvxWR:$src2, IntRegs:$src3)>; - - def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, HvxWR:$src2, - IntRegs:$src3), - (MI HvxVR:$src1, HvxWR:$src2, IntRegs:$src3)>; -} - -multiclass T_VVV_pat <InstHexagon MI, Intrinsic IntID> { - def: Pat<(IntID HvxVR:$src1, HvxVR:$src2, HvxVR:$src3), - (MI HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>; - - def: Pat<(!cast<Intrinsic>(IntID#"_128B") 
HvxVR:$src1, HvxVR:$src2, - HvxVR:$src3), - (MI HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>; -} - -multiclass T_WVV_pat <InstHexagon MI, Intrinsic IntID> { - def: Pat<(IntID HvxWR:$src1, HvxVR:$src2, HvxVR:$src3), - (MI HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>; - - def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1, HvxVR:$src2, - HvxVR:$src3), - (MI HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>; -} - -multiclass T_QVV_pat <InstHexagon MI, Intrinsic IntID> { - def: Pat<(IntID HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), - (MI HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>; - - def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxQR:$src1, HvxVR:$src2, - HvxVR:$src3), - (MI HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>; -} - -multiclass T_VQR_pat <InstHexagon MI, Intrinsic IntID> { - def: Pat<(IntID HvxVR:$src1, HvxQR:$src2, IntRegs:$src3), - (MI HvxVR:$src1, HvxQR:$src2, IntRegs:$src3)>; - - def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, HvxQR:$src2, - IntRegs:$src3), - (MI HvxVR:$src1, HvxQR:$src2, IntRegs:$src3)>; -} - - -multiclass T_QVR_pat <InstHexagon MI, Intrinsic IntID> { - def: Pat<(IntID HvxQR:$src1, HvxVR:$src2, IntRegs:$src3), - (MI HvxQR:$src1, HvxVR:$src2, IntRegs:$src3)>; - - def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxQR:$src1, HvxVR:$src2, - IntRegs:$src3), - (MI HvxQR:$src1, HvxVR:$src2, IntRegs:$src3)>; -} - -multiclass T_VVI_pat <InstHexagon MI, Intrinsic IntID> { - def: Pat<(IntID HvxVR:$src1, HvxVR:$src2, imm:$src3), - (MI HvxVR:$src1, HvxVR:$src2, imm:$src3)>; - - def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, - HvxVR:$src2, imm:$src3), - (MI HvxVR:$src1, HvxVR:$src2, imm:$src3)>; -} - -multiclass T_WRI_pat <InstHexagon MI, Intrinsic IntID> { - def: Pat<(IntID HvxWR:$src1, IntRegs:$src2, imm:$src3), - (MI HvxWR:$src1, IntRegs:$src2, imm:$src3)>; - - def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1, - IntRegs:$src2, imm:$src3), - (MI HvxWR:$src1, IntRegs:$src2, imm:$src3)>; -} - -multiclass T_WWRI_pat <InstHexagon MI, Intrinsic IntID> { - def: Pat<(IntID HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, imm:$src4), - (MI HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, imm:$src4)>; - - def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1, HvxWR:$src2, - IntRegs:$src3, imm:$src4), - (MI HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, imm:$src4)>; -} - -multiclass T_VVVR_pat <InstHexagon MI, Intrinsic IntID> { - def: Pat<(IntID HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegs:$src4), - (MI HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegs:$src4)>; - - def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, HvxVR:$src2, - HvxVR:$src3, IntRegs:$src4), - (MI HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegs:$src4)>; -} - -multiclass T_WVVR_pat <InstHexagon MI, Intrinsic IntID> { - def: Pat<(IntID HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegs:$src4), - (MI HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegs:$src4)>; - - def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1, HvxVR:$src2, - HvxVR:$src3, IntRegs:$src4), - (MI HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegs:$src4)>; -} - -defm : T_WR_pat <V6_vtmpyb, int_hexagon_V6_vtmpyb>; -defm : T_WR_pat <V6_vtmpybus, int_hexagon_V6_vtmpybus>; -defm : T_VR_pat <V6_vdmpyhb, int_hexagon_V6_vdmpyhb>; -defm : T_VR_pat <V6_vrmpyub, int_hexagon_V6_vrmpyub>; -defm : T_VR_pat <V6_vrmpybus, int_hexagon_V6_vrmpybus>; -defm : T_WR_pat <V6_vdsaduh, int_hexagon_V6_vdsaduh>; -defm : T_VR_pat <V6_vdmpybus, int_hexagon_V6_vdmpybus>; -defm : T_WR_pat <V6_vdmpybus_dv, int_hexagon_V6_vdmpybus_dv>; -defm : T_VR_pat <V6_vdmpyhsusat, int_hexagon_V6_vdmpyhsusat>; -defm : T_WR_pat 
<V6_vdmpyhsuisat, int_hexagon_V6_vdmpyhsuisat>; -defm : T_VR_pat <V6_vdmpyhsat, int_hexagon_V6_vdmpyhsat>; -defm : T_WR_pat <V6_vdmpyhisat, int_hexagon_V6_vdmpyhisat>; -defm : T_WR_pat <V6_vdmpyhb_dv, int_hexagon_V6_vdmpyhb_dv>; -defm : T_VR_pat <V6_vmpybus, int_hexagon_V6_vmpybus>; -defm : T_WR_pat <V6_vmpabus, int_hexagon_V6_vmpabus>; -defm : T_WR_pat <V6_vmpahb, int_hexagon_V6_vmpahb>; -defm : T_VR_pat <V6_vmpyh, int_hexagon_V6_vmpyh>; -defm : T_VR_pat <V6_vmpyhss, int_hexagon_V6_vmpyhss>; -defm : T_VR_pat <V6_vmpyhsrs, int_hexagon_V6_vmpyhsrs>; -defm : T_VR_pat <V6_vmpyuh, int_hexagon_V6_vmpyuh>; -defm : T_VR_pat <V6_vmpyihb, int_hexagon_V6_vmpyihb>; -defm : T_VR_pat <V6_vror, int_hexagon_V6_vror>; -defm : T_VR_pat <V6_vasrw, int_hexagon_V6_vasrw>; -defm : T_VR_pat <V6_vasrh, int_hexagon_V6_vasrh>; -defm : T_VR_pat <V6_vaslw, int_hexagon_V6_vaslw>; -defm : T_VR_pat <V6_vaslh, int_hexagon_V6_vaslh>; -defm : T_VR_pat <V6_vlsrw, int_hexagon_V6_vlsrw>; -defm : T_VR_pat <V6_vlsrh, int_hexagon_V6_vlsrh>; -defm : T_VR_pat <V6_vmpyiwh, int_hexagon_V6_vmpyiwh>; -defm : T_VR_pat <V6_vmpyiwb, int_hexagon_V6_vmpyiwb>; -defm : T_WR_pat <V6_vtmpyhb, int_hexagon_V6_vtmpyhb>; -defm : T_VR_pat <V6_vmpyub, int_hexagon_V6_vmpyub>; - -defm : T_VV_pat <V6_vrmpyubv, int_hexagon_V6_vrmpyubv>; -defm : T_VV_pat <V6_vrmpybv, int_hexagon_V6_vrmpybv>; -defm : T_VV_pat <V6_vrmpybusv, int_hexagon_V6_vrmpybusv>; -defm : T_VV_pat <V6_vdmpyhvsat, int_hexagon_V6_vdmpyhvsat>; -defm : T_VV_pat <V6_vmpybv, int_hexagon_V6_vmpybv>; -defm : T_VV_pat <V6_vmpyubv, int_hexagon_V6_vmpyubv>; -defm : T_VV_pat <V6_vmpybusv, int_hexagon_V6_vmpybusv>; -defm : T_VV_pat <V6_vmpyhv, int_hexagon_V6_vmpyhv>; -defm : T_VV_pat <V6_vmpyuhv, int_hexagon_V6_vmpyuhv>; -defm : T_VV_pat <V6_vmpyhvsrs, int_hexagon_V6_vmpyhvsrs>; -defm : T_VV_pat <V6_vmpyhus, int_hexagon_V6_vmpyhus>; -defm : T_WW_pat <V6_vmpabusv, int_hexagon_V6_vmpabusv>; -defm : T_VV_pat <V6_vmpyih, int_hexagon_V6_vmpyih>; -defm : T_VV_pat <V6_vand, int_hexagon_V6_vand>; -defm : T_VV_pat <V6_vor, int_hexagon_V6_vor>; -defm : T_VV_pat <V6_vxor, int_hexagon_V6_vxor>; -defm : T_VV_pat <V6_vaddw, int_hexagon_V6_vaddw>; -defm : T_VV_pat <V6_vaddubsat, int_hexagon_V6_vaddubsat>; -defm : T_VV_pat <V6_vadduhsat, int_hexagon_V6_vadduhsat>; -defm : T_VV_pat <V6_vaddhsat, int_hexagon_V6_vaddhsat>; -defm : T_VV_pat <V6_vaddwsat, int_hexagon_V6_vaddwsat>; -defm : T_VV_pat <V6_vsubb, int_hexagon_V6_vsubb>; -defm : T_VV_pat <V6_vsubh, int_hexagon_V6_vsubh>; -defm : T_VV_pat <V6_vsubw, int_hexagon_V6_vsubw>; -defm : T_VV_pat <V6_vsububsat, int_hexagon_V6_vsububsat>; -defm : T_VV_pat <V6_vsubuhsat, int_hexagon_V6_vsubuhsat>; -defm : T_VV_pat <V6_vsubhsat, int_hexagon_V6_vsubhsat>; -defm : T_VV_pat <V6_vsubwsat, int_hexagon_V6_vsubwsat>; -defm : T_WW_pat <V6_vaddb_dv, int_hexagon_V6_vaddb_dv>; -defm : T_WW_pat <V6_vaddh_dv, int_hexagon_V6_vaddh_dv>; -defm : T_WW_pat <V6_vaddw_dv, int_hexagon_V6_vaddw_dv>; -defm : T_WW_pat <V6_vaddubsat_dv, int_hexagon_V6_vaddubsat_dv>; -defm : T_WW_pat <V6_vadduhsat_dv, int_hexagon_V6_vadduhsat_dv>; -defm : T_WW_pat <V6_vaddhsat_dv, int_hexagon_V6_vaddhsat_dv>; -defm : T_WW_pat <V6_vaddwsat_dv, int_hexagon_V6_vaddwsat_dv>; -defm : T_WW_pat <V6_vsubb_dv, int_hexagon_V6_vsubb_dv>; -defm : T_WW_pat <V6_vsubh_dv, int_hexagon_V6_vsubh_dv>; -defm : T_WW_pat <V6_vsubw_dv, int_hexagon_V6_vsubw_dv>; -defm : T_WW_pat <V6_vsububsat_dv, int_hexagon_V6_vsububsat_dv>; -defm : T_WW_pat <V6_vsubuhsat_dv, int_hexagon_V6_vsubuhsat_dv>; -defm : T_WW_pat <V6_vsubhsat_dv, 
int_hexagon_V6_vsubhsat_dv>; -defm : T_WW_pat <V6_vsubwsat_dv, int_hexagon_V6_vsubwsat_dv>; -defm : T_VV_pat <V6_vaddubh, int_hexagon_V6_vaddubh>; -defm : T_VV_pat <V6_vadduhw, int_hexagon_V6_vadduhw>; -defm : T_VV_pat <V6_vaddhw, int_hexagon_V6_vaddhw>; -defm : T_VV_pat <V6_vsububh, int_hexagon_V6_vsububh>; -defm : T_VV_pat <V6_vsubuhw, int_hexagon_V6_vsubuhw>; -defm : T_VV_pat <V6_vsubhw, int_hexagon_V6_vsubhw>; -defm : T_VV_pat <V6_vabsdiffub, int_hexagon_V6_vabsdiffub>; -defm : T_VV_pat <V6_vabsdiffh, int_hexagon_V6_vabsdiffh>; -defm : T_VV_pat <V6_vabsdiffuh, int_hexagon_V6_vabsdiffuh>; -defm : T_VV_pat <V6_vabsdiffw, int_hexagon_V6_vabsdiffw>; -defm : T_VV_pat <V6_vavgub, int_hexagon_V6_vavgub>; -defm : T_VV_pat <V6_vavguh, int_hexagon_V6_vavguh>; -defm : T_VV_pat <V6_vavgh, int_hexagon_V6_vavgh>; -defm : T_VV_pat <V6_vavgw, int_hexagon_V6_vavgw>; -defm : T_VV_pat <V6_vnavgub, int_hexagon_V6_vnavgub>; -defm : T_VV_pat <V6_vnavgh, int_hexagon_V6_vnavgh>; -defm : T_VV_pat <V6_vnavgw, int_hexagon_V6_vnavgw>; -defm : T_VV_pat <V6_vavgubrnd, int_hexagon_V6_vavgubrnd>; -defm : T_VV_pat <V6_vavguhrnd, int_hexagon_V6_vavguhrnd>; -defm : T_VV_pat <V6_vavghrnd, int_hexagon_V6_vavghrnd>; -defm : T_VV_pat <V6_vavgwrnd, int_hexagon_V6_vavgwrnd>; -defm : T_WW_pat <V6_vmpabuuv, int_hexagon_V6_vmpabuuv>; - -defm : T_VVR_pat <V6_vdmpyhb_acc, int_hexagon_V6_vdmpyhb_acc>; -defm : T_VVR_pat <V6_vrmpyub_acc, int_hexagon_V6_vrmpyub_acc>; -defm : T_VVR_pat <V6_vrmpybus_acc, int_hexagon_V6_vrmpybus_acc>; -defm : T_VVR_pat <V6_vdmpybus_acc, int_hexagon_V6_vdmpybus_acc>; -defm : T_VVR_pat <V6_vdmpyhsusat_acc, int_hexagon_V6_vdmpyhsusat_acc>; -defm : T_VVR_pat <V6_vdmpyhsat_acc, int_hexagon_V6_vdmpyhsat_acc>; -defm : T_VVR_pat <V6_vmpyiwb_acc, int_hexagon_V6_vmpyiwb_acc>; -defm : T_VVR_pat <V6_vmpyiwh_acc, int_hexagon_V6_vmpyiwh_acc>; -defm : T_VVR_pat <V6_vmpyihb_acc, int_hexagon_V6_vmpyihb_acc>; -defm : T_VVR_pat <V6_vaslw_acc, int_hexagon_V6_vaslw_acc>; -defm : T_VVR_pat <V6_vasrw_acc, int_hexagon_V6_vasrw_acc>; - -defm : T_VWR_pat <V6_vdmpyhsuisat_acc, int_hexagon_V6_vdmpyhsuisat_acc>; -defm : T_VWR_pat <V6_vdmpyhisat_acc, int_hexagon_V6_vdmpyhisat_acc>; - -defm : T_WVR_pat <V6_vmpybus_acc, int_hexagon_V6_vmpybus_acc>; -defm : T_WVR_pat <V6_vmpyhsat_acc, int_hexagon_V6_vmpyhsat_acc>; -defm : T_WVR_pat <V6_vmpyuh_acc, int_hexagon_V6_vmpyuh_acc>; -defm : T_WVR_pat <V6_vmpyub_acc, int_hexagon_V6_vmpyub_acc>; - -defm : T_WWR_pat <V6_vtmpyb_acc, int_hexagon_V6_vtmpyb_acc>; -defm : T_WWR_pat <V6_vtmpybus_acc, int_hexagon_V6_vtmpybus_acc>; -defm : T_WWR_pat <V6_vtmpyhb_acc, int_hexagon_V6_vtmpyhb_acc>; -defm : T_WWR_pat <V6_vdmpybus_dv_acc, int_hexagon_V6_vdmpybus_dv_acc>; -defm : T_WWR_pat <V6_vdmpyhb_dv_acc, int_hexagon_V6_vdmpyhb_dv_acc>; -defm : T_WWR_pat <V6_vmpabus_acc, int_hexagon_V6_vmpabus_acc>; -defm : T_WWR_pat <V6_vmpahb_acc, int_hexagon_V6_vmpahb_acc>; -defm : T_WWR_pat <V6_vdsaduh_acc, int_hexagon_V6_vdsaduh_acc>; - -defm : T_VVV_pat <V6_vdmpyhvsat_acc, int_hexagon_V6_vdmpyhvsat_acc>; -defm : T_WVV_pat <V6_vmpybusv_acc, int_hexagon_V6_vmpybusv_acc>; -defm : T_WVV_pat <V6_vmpybv_acc, int_hexagon_V6_vmpybv_acc>; -defm : T_WVV_pat <V6_vmpyhus_acc, int_hexagon_V6_vmpyhus_acc>; -defm : T_WVV_pat <V6_vmpyhv_acc, int_hexagon_V6_vmpyhv_acc>; -defm : T_VVV_pat <V6_vmpyiewh_acc, int_hexagon_V6_vmpyiewh_acc>; -defm : T_VVV_pat <V6_vmpyiewuh_acc, int_hexagon_V6_vmpyiewuh_acc>; -defm : T_VVV_pat <V6_vmpyih_acc, int_hexagon_V6_vmpyih_acc>; -defm : T_VVV_pat <V6_vmpyowh_rnd_sacc, int_hexagon_V6_vmpyowh_rnd_sacc>; 
-defm : T_VVV_pat <V6_vmpyowh_sacc, int_hexagon_V6_vmpyowh_sacc>; -defm : T_WVV_pat <V6_vmpyubv_acc, int_hexagon_V6_vmpyubv_acc>; -defm : T_WVV_pat <V6_vmpyuhv_acc, int_hexagon_V6_vmpyuhv_acc>; -defm : T_VVV_pat <V6_vrmpybusv_acc, int_hexagon_V6_vrmpybusv_acc>; -defm : T_VVV_pat <V6_vrmpybv_acc, int_hexagon_V6_vrmpybv_acc>; -defm : T_VVV_pat <V6_vrmpyubv_acc, int_hexagon_V6_vrmpyubv_acc>; - -// Compare instructions -defm : T_QVV_pat <V6_veqb_and, int_hexagon_V6_veqb_and>; -defm : T_QVV_pat <V6_veqh_and, int_hexagon_V6_veqh_and>; -defm : T_QVV_pat <V6_veqw_and, int_hexagon_V6_veqw_and>; -defm : T_QVV_pat <V6_vgtb_and, int_hexagon_V6_vgtb_and>; -defm : T_QVV_pat <V6_vgth_and, int_hexagon_V6_vgth_and>; -defm : T_QVV_pat <V6_vgtw_and, int_hexagon_V6_vgtw_and>; -defm : T_QVV_pat <V6_vgtub_and, int_hexagon_V6_vgtub_and>; -defm : T_QVV_pat <V6_vgtuh_and, int_hexagon_V6_vgtuh_and>; -defm : T_QVV_pat <V6_vgtuw_and, int_hexagon_V6_vgtuw_and>; -defm : T_QVV_pat <V6_veqb_or, int_hexagon_V6_veqb_or>; -defm : T_QVV_pat <V6_veqh_or, int_hexagon_V6_veqh_or>; -defm : T_QVV_pat <V6_veqw_or, int_hexagon_V6_veqw_or>; -defm : T_QVV_pat <V6_vgtb_or, int_hexagon_V6_vgtb_or>; -defm : T_QVV_pat <V6_vgth_or, int_hexagon_V6_vgth_or>; -defm : T_QVV_pat <V6_vgtw_or, int_hexagon_V6_vgtw_or>; -defm : T_QVV_pat <V6_vgtub_or, int_hexagon_V6_vgtub_or>; -defm : T_QVV_pat <V6_vgtuh_or, int_hexagon_V6_vgtuh_or>; -defm : T_QVV_pat <V6_vgtuw_or, int_hexagon_V6_vgtuw_or>; -defm : T_QVV_pat <V6_veqb_xor, int_hexagon_V6_veqb_xor>; -defm : T_QVV_pat <V6_veqh_xor, int_hexagon_V6_veqh_xor>; -defm : T_QVV_pat <V6_veqw_xor, int_hexagon_V6_veqw_xor>; -defm : T_QVV_pat <V6_vgtb_xor, int_hexagon_V6_vgtb_xor>; -defm : T_QVV_pat <V6_vgth_xor, int_hexagon_V6_vgth_xor>; -defm : T_QVV_pat <V6_vgtw_xor, int_hexagon_V6_vgtw_xor>; -defm : T_QVV_pat <V6_vgtub_xor, int_hexagon_V6_vgtub_xor>; -defm : T_QVV_pat <V6_vgtuh_xor, int_hexagon_V6_vgtuh_xor>; -defm : T_QVV_pat <V6_vgtuw_xor, int_hexagon_V6_vgtuw_xor>; - -defm : T_VV_pat <V6_vminub, int_hexagon_V6_vminub>; -defm : T_VV_pat <V6_vminuh, int_hexagon_V6_vminuh>; -defm : T_VV_pat <V6_vminh, int_hexagon_V6_vminh>; -defm : T_VV_pat <V6_vminw, int_hexagon_V6_vminw>; -defm : T_VV_pat <V6_vmaxub, int_hexagon_V6_vmaxub>; -defm : T_VV_pat <V6_vmaxuh, int_hexagon_V6_vmaxuh>; -defm : T_VV_pat <V6_vmaxh, int_hexagon_V6_vmaxh>; -defm : T_VV_pat <V6_vmaxw, int_hexagon_V6_vmaxw>; -defm : T_VV_pat <V6_vdelta, int_hexagon_V6_vdelta>; -defm : T_VV_pat <V6_vrdelta, int_hexagon_V6_vrdelta>; -defm : T_VV_pat <V6_vdealb4w, int_hexagon_V6_vdealb4w>; -defm : T_VV_pat <V6_vmpyowh_rnd, int_hexagon_V6_vmpyowh_rnd>; -defm : T_VV_pat <V6_vshuffeb, int_hexagon_V6_vshuffeb>; -defm : T_VV_pat <V6_vshuffob, int_hexagon_V6_vshuffob>; -defm : T_VV_pat <V6_vshufeh, int_hexagon_V6_vshufeh>; -defm : T_VV_pat <V6_vshufoh, int_hexagon_V6_vshufoh>; -defm : T_VV_pat <V6_vshufoeh, int_hexagon_V6_vshufoeh>; -defm : T_VV_pat <V6_vshufoeb, int_hexagon_V6_vshufoeb>; -defm : T_VV_pat <V6_vcombine, int_hexagon_V6_vcombine>; -defm : T_VV_pat <V6_vmpyieoh, int_hexagon_V6_vmpyieoh>; -defm : T_VV_pat <V6_vsathub, int_hexagon_V6_vsathub>; -defm : T_VV_pat <V6_vsatwh, int_hexagon_V6_vsatwh>; -defm : T_VV_pat <V6_vroundwh, int_hexagon_V6_vroundwh>; -defm : T_VV_pat <V6_vroundwuh, int_hexagon_V6_vroundwuh>; -defm : T_VV_pat <V6_vroundhb, int_hexagon_V6_vroundhb>; -defm : T_VV_pat <V6_vroundhub, int_hexagon_V6_vroundhub>; -defm : T_VV_pat <V6_vasrwv, int_hexagon_V6_vasrwv>; -defm : T_VV_pat <V6_vlsrwv, int_hexagon_V6_vlsrwv>; -defm : T_VV_pat 
<V6_vlsrhv, int_hexagon_V6_vlsrhv>; -defm : T_VV_pat <V6_vasrhv, int_hexagon_V6_vasrhv>; -defm : T_VV_pat <V6_vaslwv, int_hexagon_V6_vaslwv>; -defm : T_VV_pat <V6_vaslhv, int_hexagon_V6_vaslhv>; -defm : T_VV_pat <V6_vaddb, int_hexagon_V6_vaddb>; -defm : T_VV_pat <V6_vaddh, int_hexagon_V6_vaddh>; -defm : T_VV_pat <V6_vmpyiewuh, int_hexagon_V6_vmpyiewuh>; -defm : T_VV_pat <V6_vmpyiowh, int_hexagon_V6_vmpyiowh>; -defm : T_VV_pat <V6_vpackeb, int_hexagon_V6_vpackeb>; -defm : T_VV_pat <V6_vpackeh, int_hexagon_V6_vpackeh>; -defm : T_VV_pat <V6_vpackhub_sat, int_hexagon_V6_vpackhub_sat>; -defm : T_VV_pat <V6_vpackhb_sat, int_hexagon_V6_vpackhb_sat>; -defm : T_VV_pat <V6_vpackwuh_sat, int_hexagon_V6_vpackwuh_sat>; -defm : T_VV_pat <V6_vpackwh_sat, int_hexagon_V6_vpackwh_sat>; -defm : T_VV_pat <V6_vpackob, int_hexagon_V6_vpackob>; -defm : T_VV_pat <V6_vpackoh, int_hexagon_V6_vpackoh>; -defm : T_VV_pat <V6_vmpyewuh, int_hexagon_V6_vmpyewuh>; -defm : T_VV_pat <V6_vmpyowh, int_hexagon_V6_vmpyowh>; - -defm : T_QVV_pat <V6_vaddbq, int_hexagon_V6_vaddbq>; -defm : T_QVV_pat <V6_vaddhq, int_hexagon_V6_vaddhq>; -defm : T_QVV_pat <V6_vaddwq, int_hexagon_V6_vaddwq>; -defm : T_QVV_pat <V6_vaddbnq, int_hexagon_V6_vaddbnq>; -defm : T_QVV_pat <V6_vaddhnq, int_hexagon_V6_vaddhnq>; -defm : T_QVV_pat <V6_vaddwnq, int_hexagon_V6_vaddwnq>; -defm : T_QVV_pat <V6_vsubbq, int_hexagon_V6_vsubbq>; -defm : T_QVV_pat <V6_vsubhq, int_hexagon_V6_vsubhq>; -defm : T_QVV_pat <V6_vsubwq, int_hexagon_V6_vsubwq>; -defm : T_QVV_pat <V6_vsubbnq, int_hexagon_V6_vsubbnq>; -defm : T_QVV_pat <V6_vsubhnq, int_hexagon_V6_vsubhnq>; -defm : T_QVV_pat <V6_vsubwnq, int_hexagon_V6_vsubwnq>; - -defm : T_V_pat <V6_vabsh, int_hexagon_V6_vabsh>; -defm : T_V_pat <V6_vabsw, int_hexagon_V6_vabsw>; -defm : T_V_pat <V6_vabsw_sat, int_hexagon_V6_vabsw_sat>; -defm : T_V_pat <V6_vabsh_sat, int_hexagon_V6_vabsh_sat>; -defm : T_V_pat <V6_vnot, int_hexagon_V6_vnot>; -defm : T_V_pat <V6_vassign, int_hexagon_V6_vassign>; -defm : T_V_pat <V6_vzb, int_hexagon_V6_vzb>; -defm : T_V_pat <V6_vzh, int_hexagon_V6_vzh>; -defm : T_V_pat <V6_vsb, int_hexagon_V6_vsb>; -defm : T_V_pat <V6_vsh, int_hexagon_V6_vsh>; -defm : T_V_pat <V6_vdealh, int_hexagon_V6_vdealh>; -defm : T_V_pat <V6_vdealb, int_hexagon_V6_vdealb>; -defm : T_V_pat <V6_vunpackub, int_hexagon_V6_vunpackub>; -defm : T_V_pat <V6_vunpackuh, int_hexagon_V6_vunpackuh>; -defm : T_V_pat <V6_vunpackb, int_hexagon_V6_vunpackb>; -defm : T_V_pat <V6_vunpackh, int_hexagon_V6_vunpackh>; -defm : T_V_pat <V6_vshuffh, int_hexagon_V6_vshuffh>; -defm : T_V_pat <V6_vshuffb, int_hexagon_V6_vshuffb>; -defm : T_V_pat <V6_vcl0w, int_hexagon_V6_vcl0w>; -defm : T_V_pat <V6_vpopcounth, int_hexagon_V6_vpopcounth>; -defm : T_V_pat <V6_vcl0h, int_hexagon_V6_vcl0h>; -defm : T_V_pat <V6_vnormamtw, int_hexagon_V6_vnormamtw>; -defm : T_V_pat <V6_vnormamth, int_hexagon_V6_vnormamth>; - -defm : T_W_pat <V6_lo, int_hexagon_V6_lo>; -defm : T_W_pat <V6_hi, int_hexagon_V6_hi>; -defm : T_W_pat <V6_vassignp, int_hexagon_V6_vassignp>; - -defm : T_WRI_pat <V6_vrmpybusi, int_hexagon_V6_vrmpybusi>; -defm : T_WRI_pat <V6_vrsadubi, int_hexagon_V6_vrsadubi>; -defm : T_WRI_pat <V6_vrmpyubi, int_hexagon_V6_vrmpyubi>; - -defm : T_WWRI_pat <V6_vrmpybusi_acc, int_hexagon_V6_vrmpybusi_acc>; -defm : T_WWRI_pat <V6_vrsadubi_acc, int_hexagon_V6_vrsadubi_acc>; -defm : T_WWRI_pat <V6_vrmpyubi_acc, int_hexagon_V6_vrmpyubi_acc>; - -// assembler mapped. -//defm : T_V_pat <V6_vtran2x2, int_hexagon_V6_vtran2x2>; -// not present earlier.. 
need to add intrinsic -defm : T_VVR_pat <V6_valignb, int_hexagon_V6_valignb>; -defm : T_VVR_pat <V6_vlalignb, int_hexagon_V6_vlalignb>; -defm : T_VVR_pat <V6_vasrwh, int_hexagon_V6_vasrwh>; -defm : T_VVR_pat <V6_vasrwhsat, int_hexagon_V6_vasrwhsat>; -defm : T_VVR_pat <V6_vasrwhrndsat, int_hexagon_V6_vasrwhrndsat>; -defm : T_VVR_pat <V6_vasrwuhsat, int_hexagon_V6_vasrwuhsat>; -defm : T_VVR_pat <V6_vasrhubsat, int_hexagon_V6_vasrhubsat>; -defm : T_VVR_pat <V6_vasrhubrndsat, int_hexagon_V6_vasrhubrndsat>; -defm : T_VVR_pat <V6_vasrhbrndsat, int_hexagon_V6_vasrhbrndsat>; - -defm : T_VVR_pat <V6_vshuffvdd, int_hexagon_V6_vshuffvdd>; -defm : T_VVR_pat <V6_vdealvdd, int_hexagon_V6_vdealvdd>; - -defm : T_WV_pat <V6_vunpackob, int_hexagon_V6_vunpackob>; -defm : T_WV_pat <V6_vunpackoh, int_hexagon_V6_vunpackoh>; -defm : T_VVI_pat <V6_valignbi, int_hexagon_V6_valignbi>; -defm : T_VVI_pat <V6_vlalignbi, int_hexagon_V6_vlalignbi>; - -defm : T_QVV_pat <V6_vswap, int_hexagon_V6_vswap>; -defm : T_QVV_pat <V6_vmux, int_hexagon_V6_vmux>; -defm : T_QQ_pat <V6_pred_and, int_hexagon_V6_pred_and>; -defm : T_QQ_pat <V6_pred_or, int_hexagon_V6_pred_or>; -defm : T_Q_pat <V6_pred_not, int_hexagon_V6_pred_not>; -defm : T_QQ_pat <V6_pred_xor, int_hexagon_V6_pred_xor>; -defm : T_QQ_pat <V6_pred_or_n, int_hexagon_V6_pred_or_n>; -defm : T_QQ_pat <V6_pred_and_n, int_hexagon_V6_pred_and_n>; -defm : T_VV_pat <V6_veqb, int_hexagon_V6_veqb>; -defm : T_VV_pat <V6_veqh, int_hexagon_V6_veqh>; -defm : T_VV_pat <V6_veqw, int_hexagon_V6_veqw>; -defm : T_VV_pat <V6_vgtb, int_hexagon_V6_vgtb>; -defm : T_VV_pat <V6_vgth, int_hexagon_V6_vgth>; -defm : T_VV_pat <V6_vgtw, int_hexagon_V6_vgtw>; -defm : T_VV_pat <V6_vgtub, int_hexagon_V6_vgtub>; -defm : T_VV_pat <V6_vgtuh, int_hexagon_V6_vgtuh>; -defm : T_VV_pat <V6_vgtuw, int_hexagon_V6_vgtuw>; - -defm : T_VQR_pat <V6_vandqrt_acc, int_hexagon_V6_vandqrt_acc>; -defm : T_QVR_pat <V6_vandvrt_acc, int_hexagon_V6_vandvrt_acc>; -defm : T_QR_pat <V6_vandqrt, int_hexagon_V6_vandqrt>; -defm : T_R_pat <V6_lvsplatw, int_hexagon_V6_lvsplatw>; -defm : T_R_pat <V6_pred_scalar2, int_hexagon_V6_pred_scalar2>; -defm : T_VR_pat <V6_vandvrt, int_hexagon_V6_vandvrt>; - -defm : T_VVR_pat <V6_vlutvvb, int_hexagon_V6_vlutvvb>; -defm : T_VVR_pat <V6_vlutvwh, int_hexagon_V6_vlutvwh>; -defm : T_VVVR_pat <V6_vlutvvb_oracc, int_hexagon_V6_vlutvvb_oracc>; -defm : T_WVVR_pat <V6_vlutvwh_oracc, int_hexagon_V6_vlutvwh_oracc>; - -defm : T_QVR_pat <V6_vandvrt_acc, int_hexagon_V6_vandvrt_acc>; -def : T_PI_pat <S6_rol_i_p, int_hexagon_S6_rol_i_p>; -def : T_RI_pat <S6_rol_i_r, int_hexagon_S6_rol_i_r>; -def : T_PPI_pat <S6_rol_i_p_nac, int_hexagon_S6_rol_i_p_nac>; -def : T_PPI_pat <S6_rol_i_p_acc, int_hexagon_S6_rol_i_p_acc>; -def : T_PPI_pat <S6_rol_i_p_and, int_hexagon_S6_rol_i_p_and>; -def : T_PPI_pat <S6_rol_i_p_or, int_hexagon_S6_rol_i_p_or>; -def : T_PPI_pat <S6_rol_i_p_xacc, int_hexagon_S6_rol_i_p_xacc>; -def : T_RRI_pat <S6_rol_i_r_nac, int_hexagon_S6_rol_i_r_nac>; -def : T_RRI_pat <S6_rol_i_r_acc, int_hexagon_S6_rol_i_r_acc>; -def : T_RRI_pat <S6_rol_i_r_and, int_hexagon_S6_rol_i_r_and>; -def : T_RRI_pat <S6_rol_i_r_or, int_hexagon_S6_rol_i_r_or>; -def : T_RRI_pat <S6_rol_i_r_xacc, int_hexagon_S6_rol_i_r_xacc>; - -defm : T_VR_pat <V6_extractw, int_hexagon_V6_extractw>; -defm : T_VR_pat <V6_vinsertwr, int_hexagon_V6_vinsertwr>; - -//def : T_PPQ_pat <S2_cabacencbin, int_hexagon_S2_cabacencbin>; - -def: Pat<(v64i16 (trunc v64i32:$Vdd)), - (v64i16 (V6_vpackwh_sat - (v32i32 (V6_hi HvxWR:$Vdd)), - (v32i32 (V6_lo 
HvxWR:$Vdd))))>; - -def: Pat<(int_hexagon_V6_vd0), (V6_vd0)>; -def: Pat<(int_hexagon_V6_vd0_128B), (V6_vd0)>; - diff --git a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp index c2eb24b..c34eecd 100644 --- a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp +++ b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp @@ -38,7 +38,6 @@ #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsHexagon.h" #include "llvm/IR/Module.h" diff --git a/llvm/lib/Target/Hexagon/HexagonMapAsm2IntrinV62.gen.td b/llvm/lib/Target/Hexagon/HexagonMapAsm2IntrinV62.gen.td deleted file mode 100644 index 2fcefe6..0000000 --- a/llvm/lib/Target/Hexagon/HexagonMapAsm2IntrinV62.gen.td +++ /dev/null @@ -1,179 +0,0 @@ -//===--- HexagonMapAsm2IntrinV62.gen.td -----------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -multiclass T_VR_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> { - def: Pat<(IntID HvxVR:$src1, IntRegs:$src2), - (MI HvxVR:$src1, IntRegs:$src2)>; - def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, IntRegs:$src2), - (MI HvxVR:$src1, IntRegs:$src2)>; -} - -multiclass T_VVL_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> { - def: Pat<(IntID HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), - (MI HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>; - def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, HvxVR:$src2, - IntRegsLow8:$src3), - (MI HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>; -} - -multiclass T_VV_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> { - def: Pat<(IntID HvxVR:$src1, HvxVR:$src2), - (MI HvxVR:$src1, HvxVR:$src2)>; - def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, HvxVR:$src2), - (MI HvxVR:$src1, HvxVR:$src2)>; -} - -multiclass T_WW_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> { - def: Pat<(IntID HvxWR:$src1, HvxWR:$src2), - (MI HvxWR:$src1, HvxWR:$src2)>; - def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1, HvxWR:$src2), - (MI HvxWR:$src1, HvxWR:$src2)>; -} - -multiclass T_WVV_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> { - def: Pat<(IntID HvxWR:$src1, HvxVR:$src2, HvxVR:$src3), - (MI HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>; - def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1, HvxVR:$src2, - HvxVR:$src3), - (MI HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>; -} - -multiclass T_WR_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> { - def: Pat<(IntID HvxWR:$src1, IntRegs:$src2), - (MI HvxWR:$src1, IntRegs:$src2)>; - def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1, IntRegs:$src2), - (MI HvxWR:$src1, IntRegs:$src2)>; -} - -multiclass T_WWR_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> { - def: Pat<(IntID HvxWR:$src1, HvxWR:$src2, IntRegs:$src3), - (MI HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>; - def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1, HvxWR:$src2, - IntRegs:$src3), - (MI HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>; -} - -multiclass T_VVR_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> { - def: Pat<(IntID HvxVR:$src1, HvxVR:$src2, IntRegs:$src3), - (MI HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>; - def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, HvxVR:$src2, - IntRegs:$src3), - (MI 
HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>; -} - -multiclass T_ZR_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> { - def: Pat<(IntID HvxQR:$src1, IntRegs:$src2), - (MI HvxQR:$src1, IntRegs:$src2)>; - def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxQR:$src1, IntRegs:$src2), - (MI HvxQR:$src1, IntRegs:$src2)>; -} - -multiclass T_VZR_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> { - def: Pat<(IntID HvxVR:$src1, HvxQR:$src2, IntRegs:$src3), - (MI HvxVR:$src1, HvxQR:$src2, IntRegs:$src3)>; - def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, HvxQR:$src2, - IntRegs:$src3), - (MI HvxVR:$src1, HvxQR:$src2, IntRegs:$src3)>; -} - -multiclass T_ZV_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> { - def: Pat<(IntID HvxQR:$src1, HvxVR:$src2), - (MI HvxQR:$src1, HvxVR:$src2)>; - def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxQR:$src1, HvxVR:$src2), - (MI HvxQR:$src1, HvxVR:$src2)>; -} - -multiclass T_R_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> { - def: Pat<(IntID IntRegs:$src1), - (MI IntRegs:$src1)>; - def: Pat<(!cast<Intrinsic>(IntID#"_128B") IntRegs:$src1), - (MI IntRegs:$src1)>; -} - -multiclass T_ZZ_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> { - def: Pat<(IntID HvxQR:$src1, HvxQR:$src2), - (MI HvxQR:$src1, HvxQR:$src2)>; - def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxQR:$src1, HvxQR:$src2), - (MI HvxQR:$src1, HvxQR:$src2)>; -} - -multiclass T_VVI_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> { - def: Pat<(IntID HvxVR:$src1, HvxVR:$src2, imm:$src3), - (MI HvxVR:$src1, HvxVR:$src2, imm:$src3)>; - def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, HvxVR:$src2, - imm:$src3), - (MI HvxVR:$src1, HvxVR:$src2, imm:$src3)>; -} - -multiclass T_VVVI_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> { - def: Pat<(IntID HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, imm:$src4), - (MI HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, imm:$src4)>; - def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, HvxVR:$src2, - HvxVR:$src3, imm:$src4), - (MI HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, imm:$src4)>; -} - -multiclass T_WVVI_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> { - def: Pat<(IntID HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, imm:$src4), - (MI HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, imm:$src4)>; - def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1, HvxVR:$src2, - HvxVR:$src3, imm:$src4), - (MI HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, imm:$src4)>; -} - -def : T_R_pat <S6_vsplatrbp, int_hexagon_S6_vsplatrbp>; -def : T_PP_pat <M6_vabsdiffb, int_hexagon_M6_vabsdiffb>; -def : T_PP_pat <M6_vabsdiffub, int_hexagon_M6_vabsdiffub>; -def : T_PP_pat <S6_vtrunehb_ppp, int_hexagon_S6_vtrunehb_ppp>; -def : T_PP_pat <S6_vtrunohb_ppp, int_hexagon_S6_vtrunohb_ppp>; - -defm : T_VR_HVX_gen_pat <V6_vlsrb, int_hexagon_V6_vlsrb>; -defm : T_VR_HVX_gen_pat <V6_vmpyiwub, int_hexagon_V6_vmpyiwub>; -defm : T_VVL_HVX_gen_pat <V6_vasrwuhrndsat, int_hexagon_V6_vasrwuhrndsat>; -defm : T_VVL_HVX_gen_pat <V6_vasruwuhrndsat, int_hexagon_V6_vasruwuhrndsat>; -defm : T_VVL_HVX_gen_pat <V6_vasrhbsat, int_hexagon_V6_vasrhbsat>; -defm : T_VVL_HVX_gen_pat <V6_vlutvvb_nm, int_hexagon_V6_vlutvvb_nm>; -defm : T_VVL_HVX_gen_pat <V6_vlutvwh_nm, int_hexagon_V6_vlutvwh_nm>; -defm : T_VV_HVX_gen_pat <V6_vrounduwuh, int_hexagon_V6_vrounduwuh>; -defm : T_VV_HVX_gen_pat <V6_vrounduhub, int_hexagon_V6_vrounduhub>; -defm : T_VV_HVX_gen_pat <V6_vadduwsat, int_hexagon_V6_vadduwsat>; -defm : T_VV_HVX_gen_pat <V6_vsubuwsat, int_hexagon_V6_vsubuwsat>; -defm : T_VV_HVX_gen_pat <V6_vaddbsat, int_hexagon_V6_vaddbsat>; -defm : T_VV_HVX_gen_pat <V6_vsubbsat, int_hexagon_V6_vsubbsat>; -defm : 
T_VV_HVX_gen_pat <V6_vaddububb_sat, int_hexagon_V6_vaddububb_sat>; -defm : T_VV_HVX_gen_pat <V6_vsubububb_sat, int_hexagon_V6_vsubububb_sat>; -defm : T_VV_HVX_gen_pat <V6_vmpyewuh_64, int_hexagon_V6_vmpyewuh_64>; -defm : T_VV_HVX_gen_pat <V6_vmaxb, int_hexagon_V6_vmaxb>; -defm : T_VV_HVX_gen_pat <V6_vminb, int_hexagon_V6_vminb>; -defm : T_VV_HVX_gen_pat <V6_vsatuwuh, int_hexagon_V6_vsatuwuh>; -defm : T_VV_HVX_gen_pat <V6_vaddclbw, int_hexagon_V6_vaddclbw>; -defm : T_VV_HVX_gen_pat <V6_vaddclbh, int_hexagon_V6_vaddclbh>; -defm : T_WW_HVX_gen_pat <V6_vadduwsat_dv, int_hexagon_V6_vadduwsat_dv>; -defm : T_WW_HVX_gen_pat <V6_vsubuwsat_dv, int_hexagon_V6_vsubuwsat_dv>; -defm : T_WW_HVX_gen_pat <V6_vaddbsat_dv, int_hexagon_V6_vaddbsat_dv>; -defm : T_WW_HVX_gen_pat <V6_vsubbsat_dv, int_hexagon_V6_vsubbsat_dv>; -defm : T_WVV_HVX_gen_pat <V6_vaddhw_acc, int_hexagon_V6_vaddhw_acc>; -defm : T_WVV_HVX_gen_pat <V6_vadduhw_acc, int_hexagon_V6_vadduhw_acc>; -defm : T_WVV_HVX_gen_pat <V6_vaddubh_acc, int_hexagon_V6_vaddubh_acc>; -defm : T_WVV_HVX_gen_pat <V6_vmpyowh_64_acc, int_hexagon_V6_vmpyowh_64_acc>; -defm : T_WR_HVX_gen_pat <V6_vmpauhb, int_hexagon_V6_vmpauhb>; -defm : T_WWR_HVX_gen_pat <V6_vmpauhb_acc, int_hexagon_V6_vmpauhb_acc>; -defm : T_VVR_HVX_gen_pat <V6_vmpyiwub_acc, int_hexagon_V6_vmpyiwub_acc>; -defm : T_ZR_HVX_gen_pat <V6_vandnqrt, int_hexagon_V6_vandnqrt>; -defm : T_VZR_HVX_gen_pat <V6_vandnqrt_acc, int_hexagon_V6_vandnqrt_acc>; -defm : T_ZV_HVX_gen_pat <V6_vandvqv, int_hexagon_V6_vandvqv>; -defm : T_ZV_HVX_gen_pat <V6_vandvnqv, int_hexagon_V6_vandvnqv>; -defm : T_R_HVX_gen_pat <V6_pred_scalar2v2, int_hexagon_V6_pred_scalar2v2>; -defm : T_R_HVX_gen_pat <V6_lvsplath, int_hexagon_V6_lvsplath>; -defm : T_R_HVX_gen_pat <V6_lvsplatb, int_hexagon_V6_lvsplatb>; -defm : T_ZZ_HVX_gen_pat <V6_shuffeqw, int_hexagon_V6_shuffeqw>; -defm : T_ZZ_HVX_gen_pat <V6_shuffeqh, int_hexagon_V6_shuffeqh>; -defm : T_VVI_HVX_gen_pat <V6_vlutvvbi, int_hexagon_V6_vlutvvbi>; -defm : T_VVI_HVX_gen_pat <V6_vlutvwhi, int_hexagon_V6_vlutvwhi>; -defm : T_VVVI_HVX_gen_pat <V6_vlutvvb_oracci, int_hexagon_V6_vlutvvb_oracci>; -defm : T_WVVI_HVX_gen_pat <V6_vlutvwh_oracci, int_hexagon_V6_vlutvwh_oracci>; diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index a31fa57..e915a3c4 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -2514,8 +2514,9 @@ SDValue LoongArchTargetLowering::lowerBUILD_VECTOR(SDValue Op, assert(ResTy.isVector()); unsigned NumElts = ResTy.getVectorNumElements(); - SDValue Vector = DAG.getUNDEF(ResTy); - for (unsigned i = 0; i < NumElts; ++i) { + SDValue Vector = + DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ResTy, Node->getOperand(0)); + for (unsigned i = 1; i < NumElts; ++i) { Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ResTy, Vector, Node->getOperand(i), DAG.getConstant(i, DL, Subtarget.getGRLenVT())); diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index a0107e4..5096a8f 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -1651,18 +1651,20 @@ def : Pat<(vector_insert v8i32:$xd, GRLenVT:$rj, uimm3:$imm), (XVINSGR2VR_W v8i32:$xd, GRLenVT:$rj, uimm3:$imm)>; def : Pat<(vector_insert v4i64:$xd, GRLenVT:$rj, uimm2:$imm), (XVINSGR2VR_D v4i64:$xd, GRLenVT:$rj, uimm2:$imm)>; -def : Pat<(vector_insert v8f32:$vd, 
(loongarch_movgr2fr_w_la64 GPR:$rj), uimm3:$imm), - (XVINSGR2VR_W $vd, $rj, uimm3:$imm)>; -def : Pat<(vector_insert v4f64:$vd, (f64 (bitconvert i64:$rj)), uimm2:$imm), - (XVINSGR2VR_D $vd, $rj, uimm2:$imm)>; +def : Pat<(vector_insert v8f32:$xd, (loongarch_movgr2fr_w_la64 GPR:$rj), uimm3:$imm), + (XVINSGR2VR_W $xd, $rj, uimm3:$imm)>; +def : Pat<(vector_insert v4f64:$xd, (f64 (bitconvert i64:$rj)), uimm2:$imm), + (XVINSGR2VR_D $xd, $rj, uimm2:$imm)>; def : Pat<(vector_insert v8f32:$xd, (f32 (vector_extract v8f32:$xj, uimm3:$imm1)), uimm3:$imm2), (XVINSGR2VR_W $xd, (XVPICKVE2GR_W v8f32:$xj, uimm3:$imm1), uimm3:$imm2)>; def : Pat<(vector_insert v4f64:$xd, (f64 (vector_extract v4f64:$xj, uimm2:$imm1)), uimm2:$imm2), (XVINSGR2VR_D $xd, (XVPICKVE2GR_D v4f64:$xj, uimm2:$imm1), uimm2:$imm2)>; + +// XVINSVE0_{W/D} def : Pat<(vector_insert v8f32:$xd, FPR32:$fj, uimm3:$imm), - (XVINSGR2VR_W $xd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm3:$imm)>; + (XVINSVE0_W $xd, (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32), uimm3:$imm)>; def : Pat<(vector_insert v4f64:$xd, FPR64:$fj, uimm2:$imm), - (XVINSGR2VR_D $xd, (COPY_TO_REGCLASS FPR64:$fj, GPR), uimm2:$imm)>; + (XVINSVE0_D $xd, (SUBREG_TO_REG (i64 0), FPR64:$fj, sub_64), uimm2:$imm)>; // scalar_to_vector def : Pat<(v8f32 (scalar_to_vector FPR32:$fj)), diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td index 962e7c2..3c9defb 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td @@ -1842,10 +1842,19 @@ def : Pat<(vector_insert v4f32:$vd, (loongarch_movgr2fr_w_la64 GPR:$rj), uimm2:$ (VINSGR2VR_W $vd, $rj, uimm2:$imm)>; def : Pat<(vector_insert v2f64:$vd, (f64 (bitconvert i64:$rj)), uimm1:$imm), (VINSGR2VR_D $vd, $rj, uimm1:$imm)>; -def : Pat<(vector_insert v4f32:$vd, FPR32:$fj, uimm2:$imm), - (VINSGR2VR_W $vd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm2:$imm)>; -def : Pat<(vector_insert v2f64:$vd, FPR64:$fj, uimm1:$imm), - (VINSGR2VR_D $vd, (COPY_TO_REGCLASS FPR64:$fj, GPR), uimm1:$imm)>; + +// VEXTRINS_{W/D} +foreach imm = 0...3 in { + defvar Imm = !shl(imm, 4); + def : Pat<(vector_insert v4f32:$vd, FPR32:$fj, imm), + (VEXTRINS_W $vd, (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32), Imm)>; +} + +foreach imm = 0...1 in { + defvar Imm = !shl(imm, 4); + def : Pat<(vector_insert v2f64:$vd, FPR64:$fj, imm), + (VEXTRINS_D $vd, (SUBREG_TO_REG (i64 0), FPR64:$fj, sub_64), Imm)>; +} // scalar_to_vector def : Pat<(v4f32 (scalar_to_vector FPR32:$fj)), diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.cpp index 03ce004..7cefb3f 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.cpp @@ -52,6 +52,9 @@ static ABI getTripleABI(const Triple &TT) { bool Is64Bit = TT.isArch64Bit(); ABI TripleABI; switch (TT.getEnvironment()) { + case llvm::Triple::EnvironmentType::UnknownEnvironment: + TripleABI = ABI_Unknown; + break; case llvm::Triple::EnvironmentType::GNUSF: case llvm::Triple::EnvironmentType::MuslSF: TripleABI = Is64Bit ? ABI_LP64S : ABI_ILP32S; @@ -96,7 +99,7 @@ ABI computeTargetABI(const Triple &TT, const FeatureBitset &FeatureBits, // 1. If the '-target-abi' is valid, use it. 
if (IsABIValidForFeature(ArgProvidedABI)) { - if (TT.hasEnvironment() && ArgProvidedABI != TripleABI) + if (IsABIValidForFeature(TripleABI) && ArgProvidedABI != TripleABI) errs() << "warning: triple-implied ABI conflicts with provided target-abi '" << ABIName << "', using target-abi\n"; @@ -164,10 +167,7 @@ ABI computeTargetABI(const Triple &TT, const FeatureBitset &FeatureBits, return Is64Bit ? ABI_LP64F : ABI_ILP32F; return Is64Bit ? ABI_LP64S : ABI_ILP32S; }; - if (ABIName.empty()) - errs() << "warning: the triple-implied ABI is invalid, ignoring and using " - "feature-implied ABI\n"; - else + if (!ABIName.empty()) errs() << "warning: both target-abi and the triple-implied ABI are " "invalid, ignoring and using feature-implied ABI\n"; return checkABIStandardized(GetFeatureABI()); diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp index ad8f5f0..7abe9c9 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp @@ -385,11 +385,12 @@ void MipsELFObjectWriter::sortRelocs(std::vector<ELFRelocationEntry> &Relocs) { if (hasRelocationAddend()) return; - // Sort relocations by the address they are applied to. - llvm::sort(Relocs, - [](const ELFRelocationEntry &A, const ELFRelocationEntry &B) { - return A.Offset < B.Offset; - }); + // Sort relocations by r_offset. There might be more than one at an offset + // with composed relocations or .reloc directives. + llvm::stable_sort( + Relocs, [](const ELFRelocationEntry &A, const ELFRelocationEntry &B) { + return A.Offset < B.Offset; + }); // Place relocations in a list for reorder convenience. Hi16 contains the // iterators of high-part relocations. diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 7883acc..f2c2f46 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -2068,6 +2068,8 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { static SDValue getPRMT(SDValue A, SDValue B, SDValue Selector, SDLoc DL, SelectionDAG &DAG, unsigned Mode = NVPTX::PTXPrmtMode::NONE) { + assert(A.getValueType() == MVT::i32 && B.getValueType() == MVT::i32 && + Selector.getValueType() == MVT::i32 && "PRMT must have i32 operands"); return DAG.getNode(NVPTXISD::PRMT, DL, MVT::i32, {A, B, Selector, DAG.getConstant(Mode, DL, MVT::i32)}); } @@ -5872,6 +5874,8 @@ static SDValue combineADDRSPACECAST(SDNode *N, // details: // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt static APInt getPRMTSelector(const APInt &Selector, unsigned Mode) { + assert(Selector.getBitWidth() == 32 && "PRMT must have i32 operands"); + if (Mode == NVPTX::PTXPrmtMode::NONE) return Selector; @@ -5903,6 +5907,8 @@ static APInt getPRMTSelector(const APInt &Selector, unsigned Mode) { } static APInt computePRMT(APInt A, APInt B, APInt Selector, unsigned Mode) { + assert(A.getBitWidth() == 32 && B.getBitWidth() == 32 && + Selector.getBitWidth() == 32 && "PRMT must have i32 operands"); // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}} APInt BitField = B.concat(A); APInt SelectorVal = getPRMTSelector(Selector, Mode); @@ -6537,10 +6543,13 @@ static void computeKnownBitsForPRMT(const SDValue Op, KnownBits &Known, KnownBits BKnown = DAG.computeKnownBits(B, Depth); // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}} + assert(AKnown.getBitWidth() == 32 && 
BKnown.getBitWidth() == 32 && + "PRMT must have i32 operands"); + assert(Known.getBitWidth() == 32 && "PRMT must have i32 result"); KnownBits BitField = BKnown.concat(AKnown); APInt SelectorVal = getPRMTSelector(Selector->getAPIntValue(), Mode); - for (unsigned I : llvm::seq(std::min(4U, Known.getBitWidth() / 8))) { + for (unsigned I : llvm::seq(4)) { APInt Sel = SelectorVal.extractBits(4, I * 4); unsigned Idx = Sel.getLoBits(3).getZExtValue(); unsigned Sign = Sel.getHiBits(1).getZExtValue(); @@ -6564,3 +6573,102 @@ void NVPTXTargetLowering::computeKnownBitsForTargetNode( break; } } + +static std::pair<APInt, APInt> getPRMTDemandedBits(const APInt &SelectorVal, + const APInt &DemandedBits) { + APInt DemandedLHS = APInt(32, 0); + APInt DemandedRHS = APInt(32, 0); + + for (unsigned I : llvm::seq(4)) { + if (DemandedBits.extractBits(8, I * 8).isZero()) + continue; + + APInt Sel = SelectorVal.extractBits(4, I * 4); + unsigned Idx = Sel.getLoBits(3).getZExtValue(); + unsigned Sign = Sel.getHiBits(1).getZExtValue(); + + APInt &Src = Idx < 4 ? DemandedLHS : DemandedRHS; + unsigned ByteStart = (Idx % 4) * 8; + if (Sign) + Src.setBit(ByteStart + 7); + else + Src.setBits(ByteStart, ByteStart + 8); + } + + return {DemandedLHS, DemandedRHS}; +} + +// Replace undef with 0 as this is easier for other optimizations such as +// known bits. +static SDValue canonicalizePRMTInput(SDValue Op, SelectionDAG &DAG) { + if (!Op) + return SDValue(); + if (Op.isUndef()) + return DAG.getConstant(0, SDLoc(), MVT::i32); + return Op; +} + +static SDValue simplifyDemandedBitsForPRMT(SDValue PRMT, + const APInt &DemandedBits, + SelectionDAG &DAG, + const TargetLowering &TLI, + unsigned Depth) { + assert(PRMT.getOpcode() == NVPTXISD::PRMT); + SDValue Op0 = PRMT.getOperand(0); + SDValue Op1 = PRMT.getOperand(1); + auto *SelectorConst = dyn_cast<ConstantSDNode>(PRMT.getOperand(2)); + if (!SelectorConst) + return SDValue(); + + unsigned Mode = PRMT.getConstantOperandVal(3); + const APInt Selector = getPRMTSelector(SelectorConst->getAPIntValue(), Mode); + + // Try to simplify the PRMT to one of the inputs if the used bytes are all + // from the same input in the correct order. + const unsigned LeadingBytes = DemandedBits.countLeadingZeros() / 8; + const unsigned SelBits = (4 - LeadingBytes) * 4; + if (Selector.getLoBits(SelBits) == APInt(32, 0x3210).getLoBits(SelBits)) + return Op0; + if (Selector.getLoBits(SelBits) == APInt(32, 0x7654).getLoBits(SelBits)) + return Op1; + + auto [DemandedLHS, DemandedRHS] = getPRMTDemandedBits(Selector, DemandedBits); + + // Attempt to avoid multi-use ops if we don't need anything from them. + SDValue DemandedOp0 = + TLI.SimplifyMultipleUseDemandedBits(Op0, DemandedLHS, DAG, Depth + 1); + SDValue DemandedOp1 = + TLI.SimplifyMultipleUseDemandedBits(Op1, DemandedRHS, DAG, Depth + 1); + + DemandedOp0 = canonicalizePRMTInput(DemandedOp0, DAG); + DemandedOp1 = canonicalizePRMTInput(DemandedOp1, DAG); + if ((DemandedOp0 && DemandedOp0 != Op0) || + (DemandedOp1 && DemandedOp1 != Op1)) { + Op0 = DemandedOp0 ? DemandedOp0 : Op0; + Op1 = DemandedOp1 ? 
DemandedOp1 : Op1; + return getPRMT(Op0, Op1, Selector.getZExtValue(), SDLoc(PRMT), DAG); + } + + return SDValue(); +} + +bool NVPTXTargetLowering::SimplifyDemandedBitsForTargetNode( + SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, + KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const { + Known.resetAll(); + + switch (Op.getOpcode()) { + case NVPTXISD::PRMT: + if (SDValue Result = simplifyDemandedBitsForPRMT(Op, DemandedBits, TLO.DAG, + *this, Depth)) { + TLO.CombineTo(Op, Result); + return true; + } + break; + default: + break; + } + + computeKnownBitsForTargetNode(Op, Known, DemandedElts, TLO.DAG, Depth); + return false; +} diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index bc3548c..228e2aa 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -275,6 +275,11 @@ public: const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth = 0) const override; + bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, + const APInt &DemandedElts, + KnownBits &Known, + TargetLoweringOpt &TLO, + unsigned Depth = 0) const override; private: const NVPTXSubtarget &STI; // cache the subtarget here diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td index 1ac91fa..80fac18 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td +++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td @@ -53,34 +53,30 @@ let Predicates = [IsISAFuture] in { let Predicates = [HasVSX, IsISAFuture] in { let mayLoad = 1 in { - def LXVRL : XX1Form_memOp<31, 525, (outs vsrc:$XT), (ins memr:$RA, g8rc:$RB), - "lxvrl $XT, $RA, $RB", IIC_LdStLoad, []>; - - def LXVRLL : XX1Form_memOp<31, 557, (outs vsrc:$XT), (ins memr:$RA, g8rc:$RB), - "lxvrll $XT, $RA, $RB", IIC_LdStLoad, []>; - - def LXVPRL : XForm_XTp5_XAB5<31, 589, (outs vsrprc:$XTp), - (ins memr:$RA, g8rc:$RB), - "lxvprl $XTp, $RA, $RB", IIC_LdStLFD, []>; - - def LXVPRLL : XForm_XTp5_XAB5<31, 621, (outs vsrprc:$XTp), - (ins memr:$RA, g8rc:$RB), - "lxvprll $XTp, $RA, $RB", IIC_LdStLFD, []>; + def LXVRL + : XX1Form_memOp<31, 525, (outs vsrc:$XT), (ins memr:$RA, g8rc:$RB), + "lxvrl $XT, $RA, $RB", IIC_LdStLoad, []>; + def LXVRLL + : XX1Form_memOp<31, 557, (outs vsrc:$XT), (ins memr:$RA, g8rc:$RB), + "lxvrll $XT, $RA, $RB", IIC_LdStLoad, []>; + def LXVPRL + : XForm_XTp5_XAB5<31, 589, (outs vsrprc:$XTp), (ins memr:$RA, g8rc:$RB), + "lxvprl $XTp, $RA, $RB", IIC_LdStLFD, []>; + def LXVPRLL + : XForm_XTp5_XAB5<31, 621, (outs vsrprc:$XTp), (ins memr:$RA, g8rc:$RB), + "lxvprll $XTp, $RA, $RB", IIC_LdStLFD, []>; } let mayStore = 1 in { - def STXVRL : XX1Form_memOp<31, 653, (outs), - (ins vsrc:$XT, memr:$RA, g8rc:$RB), - "stxvrl $XT, $RA, $RB", IIC_LdStLoad, []>; - - def STXVRLL : XX1Form_memOp<31, 685, (outs), - (ins vsrc:$XT, memr:$RA, g8rc:$RB), - "stxvrll $XT, $RA, $RB", IIC_LdStLoad, []>; - + def STXVRL + : XX1Form_memOp<31, 653, (outs), (ins vsrc:$XT, memr:$RA, g8rc:$RB), + "stxvrl $XT, $RA, $RB", IIC_LdStLoad, []>; + def STXVRLL + : XX1Form_memOp<31, 685, (outs), (ins vsrc:$XT, memr:$RA, g8rc:$RB), + "stxvrll $XT, $RA, $RB", IIC_LdStLoad, []>; def STXVPRL : XForm_XTp5_XAB5<31, 717, (outs), (ins vsrprc:$XTp, memr:$RA, g8rc:$RB), "stxvprl $XTp, $RA, $RB", IIC_LdStLFD, []>; - def STXVPRLL : XForm_XTp5_XAB5<31, 749, (outs), (ins vsrprc:$XTp, memr:$RA, g8rc:$RB), "stxvprll $XTp, $RA, $RB", IIC_LdStLFD, []>; diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp 
b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp index 75a0272..996b6ef 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp @@ -171,7 +171,7 @@ void PPCSubtarget::getCriticalPathRCs(RegClassVector &CriticalPathRCs) const { } void PPCSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, - unsigned NumRegionInstrs) const { + const SchedRegion &Region) const { // The GenericScheduler that we use defaults to scheduling bottom up only. // We want to schedule from both the top and the bottom and so we set // OnlyBottomUp to false. diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h index 9a97d1a..3c59a47 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.h +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h @@ -240,7 +240,8 @@ public: void getCriticalPathRCs(RegClassVector &CriticalPathRCs) const override; void overrideSchedPolicy(MachineSchedPolicy &Policy, - unsigned NumRegionInstrs) const override; + const SchedRegion &Region) const override; + bool useAA() const override; bool enableSubRegLiveness() const override; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp index aeda5ac..5abb546 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp @@ -52,15 +52,6 @@ namespace RISCV { #include "RISCVGenSearchableTables.inc" } // namespace RISCV -// Report an error but don't ask the user to report a bug. -// TODO: Remove these wrappers. -[[noreturn]] static void reportError(const char *Reason) { - reportFatalUsageError(Reason); -} -[[noreturn]] static void reportError(Error Err) { - reportFatalUsageError(std::move(Err)); -} - namespace RISCVABI { ABI computeTargetABI(const Triple &TT, const FeatureBitset &FeatureBits, StringRef ABIName) { @@ -97,7 +88,7 @@ ABI computeTargetABI(const Triple &TT, const FeatureBitset &FeatureBits, if ((TargetABI == RISCVABI::ABI::ABI_ILP32E || (TargetABI == ABI_Unknown && IsRVE && !IsRV64)) && FeatureBits[RISCV::FeatureStdExtD]) - reportError("ILP32E cannot be used with the D ISA extension"); + reportFatalUsageError("ILP32E cannot be used with the D ISA extension"); if (TargetABI != ABI_Unknown) return TargetABI; @@ -105,7 +96,7 @@ ABI computeTargetABI(const Triple &TT, const FeatureBitset &FeatureBits, // If no explicit ABI is given, try to compute the default ABI. 
auto ISAInfo = RISCVFeatures::parseFeatureBits(IsRV64, FeatureBits); if (!ISAInfo) - reportError(ISAInfo.takeError()); + reportFatalUsageError(ISAInfo.takeError()); return getTargetABI((*ISAInfo)->computeDefaultABI()); } @@ -137,12 +128,12 @@ namespace RISCVFeatures { void validate(const Triple &TT, const FeatureBitset &FeatureBits) { if (TT.isArch64Bit() && !FeatureBits[RISCV::Feature64Bit]) - reportError("RV64 target requires an RV64 CPU"); + reportFatalUsageError("RV64 target requires an RV64 CPU"); if (!TT.isArch64Bit() && !FeatureBits[RISCV::Feature32Bit]) - reportError("RV32 target requires an RV32 CPU"); + reportFatalUsageError("RV32 target requires an RV32 CPU"); if (FeatureBits[RISCV::Feature32Bit] && FeatureBits[RISCV::Feature64Bit]) - reportError("RV32 and RV64 can't be combined"); + reportFatalUsageError("RV32 and RV64 can't be combined"); } llvm::Expected<std::unique_ptr<RISCVISAInfo>> diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp index baa508a..269b117 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp @@ -13,13 +13,7 @@ #include "MCTargetDesc/RISCVAsmBackend.h" #include "MCTargetDesc/RISCVMCAsmInfo.h" -#include "RISCVFixupKinds.h" #include "llvm/BinaryFormat/ELF.h" -#include "llvm/MC/MCAssembler.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCValue.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" using namespace llvm; diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.td b/llvm/lib/Target/RISCV/RISCVCallingConv.td index cbf039e..4c303a9 100644 --- a/llvm/lib/Target/RISCV/RISCVCallingConv.td +++ b/llvm/lib/Target/RISCV/RISCVCallingConv.td @@ -56,19 +56,21 @@ def CSR_XLEN_F32_Interrupt: CalleeSavedRegs<(add CSR_Interrupt, def CSR_XLEN_F64_Interrupt: CalleeSavedRegs<(add CSR_Interrupt, (sequence "F%u_D", 0, 31))>; +defvar VREGS = (add (sequence "V%u", 0, 31), + (sequence "V%uM2", 0, 31, 2), + (sequence "V%uM4", 0, 31, 4), + (sequence "V%uM8", 0, 31, 8)); + // Same as CSR_Interrupt, but including all vector registers. -def CSR_XLEN_V_Interrupt: CalleeSavedRegs<(add CSR_Interrupt, - (sequence "V%u", 0, 31))>; +def CSR_XLEN_V_Interrupt: CalleeSavedRegs<(add CSR_Interrupt, VREGS)>; // Same as CSR_Interrupt, but including all 32-bit FP registers and all vector // registers. -def CSR_XLEN_F32_V_Interrupt: CalleeSavedRegs<(add CSR_XLEN_F32_Interrupt, - (sequence "V%u", 0, 31))>; +def CSR_XLEN_F32_V_Interrupt: CalleeSavedRegs<(add CSR_XLEN_F32_Interrupt, VREGS)>; // Same as CSR_Interrupt, but including all 64-bit FP registers and all vector // registers. -def CSR_XLEN_F64_V_Interrupt: CalleeSavedRegs<(add CSR_XLEN_F64_Interrupt, - (sequence "V%u", 0, 31))>; +def CSR_XLEN_F64_V_Interrupt: CalleeSavedRegs<(add CSR_XLEN_F64_Interrupt, VREGS)>; // Same as CSR_Interrupt, but excluding X16-X31. 
def CSR_Interrupt_RVE : CalleeSavedRegs<(sub CSR_Interrupt, diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index f9c0b54..171940e 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1272,7 +1272,7 @@ def FeatureVendorXSfmm128t def FeatureVendorXSfvqmaccdod : RISCVExtension<1, 0, "SiFive Int8 Matrix Multiplication Instructions (2-by-8 and 8-by-2)", - [FeatureStdExtZve32x]>; + [FeatureStdExtZve32x, FeatureStdExtZvl128b]>; def HasVendorXSfvqmaccdod : Predicate<"Subtarget->hasVendorXSfvqmaccdod()">, AssemblerPredicate<(all_of FeatureVendorXSfvqmaccdod), @@ -1281,7 +1281,7 @@ def HasVendorXSfvqmaccdod def FeatureVendorXSfvqmaccqoq : RISCVExtension<1, 0, "SiFive Int8 Matrix Multiplication Instructions (4-by-8 and 8-by-4)", - [FeatureStdExtZve32x]>; + [FeatureStdExtZve32x, FeatureStdExtZvl256b]>; def HasVendorXSfvqmaccqoq : Predicate<"Subtarget->hasVendorXSfvqmaccqoq()">, AssemblerPredicate<(all_of FeatureVendorXSfvqmaccqoq), @@ -1290,7 +1290,7 @@ def HasVendorXSfvqmaccqoq def FeatureVendorXSfvfwmaccqqq : RISCVExtension<1, 0, "SiFive Matrix Multiply Accumulate Instruction (4-by-4)", - [FeatureStdExtZvfbfmin]>; + [FeatureStdExtZvfbfmin, FeatureStdExtZvl128b]>; def HasVendorXSfvfwmaccqqq : Predicate<"Subtarget->hasVendorXSfvfwmaccqqq()">, AssemblerPredicate<(all_of FeatureVendorXSfvfwmaccqqq), diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index 23b4554..b1ab76a 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -1544,10 +1544,53 @@ RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, return Offset; } +static MCRegister getRVVBaseRegister(const RISCVRegisterInfo &TRI, + const Register &Reg) { + MCRegister BaseReg = TRI.getSubReg(Reg, RISCV::sub_vrm1_0); + // If it's not a grouped vector register, it doesn't have subregister, so + // the base register is just itself. + if (BaseReg == RISCV::NoRegister) + BaseReg = Reg; + return BaseReg; +} + void RISCVFrameLowering::determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS) const { TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); + + // In TargetFrameLowering::determineCalleeSaves, any vector register is marked + // as saved if any of its subregister is clobbered, this is not correct in + // vector registers. We only want the vector register to be marked as saved + // if all of its subregisters are clobbered. + // For example: + // Original behavior: If v24 is marked, v24m2, v24m4, v24m8 are also marked. + // Correct behavior: v24m2 is marked only if v24 and v25 are marked. + const MachineRegisterInfo &MRI = MF.getRegInfo(); + const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs(); + const RISCVRegisterInfo &TRI = *STI.getRegisterInfo(); + for (unsigned i = 0; CSRegs[i]; ++i) { + unsigned CSReg = CSRegs[i]; + // Only vector registers need special care. + if (!RISCV::VRRegClass.contains(getRVVBaseRegister(TRI, CSReg))) + continue; + + SavedRegs.reset(CSReg); + + auto SubRegs = TRI.subregs(CSReg); + // Set the register and all its subregisters. + if (!MRI.def_empty(CSReg) || MRI.getUsedPhysRegsMask().test(CSReg)) { + SavedRegs.set(CSReg); + llvm::for_each(SubRegs, [&](unsigned Reg) { return SavedRegs.set(Reg); }); + } + + // Combine to super register if all of its subregisters are marked. 
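The rule stated in the determineCalleeSaves comment above is easiest to see with its own v24/v24m2 example. A minimal standalone sketch, assuming a plain bitset in place of the pass's SavedRegs machinery; the register indices are illustrative:

    // A grouped register is saved only when every member of the group is saved:
    // v24m2 covers {v24, v25}, so clobbering v24 alone must not spill v24m2.
    #include <bitset>
    #include <cassert>

    int main() {
      std::bitset<32> Saved;                              // one bit per V0..V31
      Saved.set(24);                                      // only v24 clobbered
      bool SaveV24M2 = Saved.test(24) && Saved.test(25);
      assert(!SaveV24M2);                                 // group not fully clobbered
      Saved.set(25);                                      // now v25 as well
      SaveV24M2 = Saved.test(24) && Saved.test(25);
      assert(SaveV24M2);                                  // whole group clobbered -> save v24m2
    }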
+ if (!SubRegs.empty() && llvm::all_of(SubRegs, [&](unsigned Reg) { + return SavedRegs.test(Reg); + })) + SavedRegs.set(CSReg); + } + // Unconditionally spill RA and FP only if the function uses a frame // pointer. if (hasFP(MF)) { @@ -2137,16 +2180,6 @@ static unsigned getCalleeSavedRVVNumRegs(const Register &BaseReg) { : 8; } -static MCRegister getRVVBaseRegister(const RISCVRegisterInfo &TRI, - const Register &Reg) { - MCRegister BaseReg = TRI.getSubReg(Reg, RISCV::sub_vrm1_0); - // If it's not a grouped vector register, it doesn't have subregister, so - // the base register is just itself. - if (BaseReg == RISCV::NoRegister) - BaseReg = Reg; - return BaseReg; -} - void RISCVFrameLowering::emitCalleeSavedRVVPrologCFI( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, bool HasFP) const { MachineFunction *MF = MBB.getParent(); diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index a541c2f..34910b7 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -3032,6 +3032,63 @@ bool RISCVDAGToDAGISel::SelectAddrRegImmLsb00000(SDValue Addr, SDValue &Base, return true; } +/// Return true if this a load/store that we have a RegRegScale instruction for. +static bool isRegRegScaleLoadOrStore(SDNode *User, SDValue Add, + const RISCVSubtarget &Subtarget) { + if (User->getOpcode() != ISD::LOAD && User->getOpcode() != ISD::STORE) + return false; + EVT VT = cast<MemSDNode>(User)->getMemoryVT(); + if (!(VT.isScalarInteger() && + (Subtarget.hasVendorXTHeadMemIdx() || Subtarget.hasVendorXqcisls())) && + !((VT == MVT::f32 || VT == MVT::f64) && + Subtarget.hasVendorXTHeadFMemIdx())) + return false; + // Don't allow stores of the value. It must be used as the address. + if (User->getOpcode() == ISD::STORE && + cast<StoreSDNode>(User)->getValue() == Add) + return false; + + return true; +} + +/// Is it profitable to fold this Add into RegRegScale load/store. If \p +/// Shift is non-null, then we have matched a shl+add. We allow reassociating +/// (add (add (shl A C2) B) C1) -> (add (add B C1) (shl A C2)) if there is a +/// single addi and we don't have a SHXADD instruction we could use. +/// FIXME: May still need to check how many and what kind of users the SHL has. +static bool isWorthFoldingIntoRegRegScale(const RISCVSubtarget &Subtarget, + SDValue Add, + SDValue Shift = SDValue()) { + bool FoundADDI = false; + for (auto *User : Add->users()) { + if (isRegRegScaleLoadOrStore(User, Add, Subtarget)) + continue; + + // Allow a single ADDI that is used by loads/stores if we matched a shift. + if (!Shift || FoundADDI || User->getOpcode() != ISD::ADD || + !isa<ConstantSDNode>(User->getOperand(1)) || + !isInt<12>(cast<ConstantSDNode>(User->getOperand(1))->getSExtValue())) + return false; + + FoundADDI = true; + + // If we have a SHXADD instruction, prefer that over reassociating an ADDI. + assert(Shift.getOpcode() == ISD::SHL); + unsigned ShiftAmt = Shift.getConstantOperandVal(1); + if ((ShiftAmt <= 3 && + (Subtarget.hasStdExtZba() || Subtarget.hasVendorXTHeadBa())) || + (ShiftAmt >= 4 && ShiftAmt <= 7 && Subtarget.hasVendorXqciac())) + return false; + + // All users of the ADDI should be load/store. 
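The hunk above only reassociates the ADDI when no shift-add instruction already covers the shift amount. A standalone sketch of that predicate alone: the shift-amount ranges are copied from the diff, while the boolean feature flags are illustrative stand-ins for the Subtarget queries.

    // Returns true when a shift-add style instruction can absorb the shl, in
    // which case the selector above declines to reassociate the ADDI.
    #include <cassert>

    static bool hasShiftAddInsn(unsigned ShiftAmt, bool HasZba, bool HasXTHeadBa,
                                bool HasXqciac) {
      if (ShiftAmt <= 3 && (HasZba || HasXTHeadBa))
        return true;                       // covered by Zba/XTHeadBa shift-add forms
      if (ShiftAmt >= 4 && ShiftAmt <= 7 && HasXqciac)
        return true;                       // covered by the Xqciac range
      return false;
    }

    int main() {
      assert(hasShiftAddInsn(3, /*Zba=*/true, false, false));   // sh3add exists
      assert(!hasShiftAddInsn(5, /*Zba=*/true, false, false));  // Zba tops out at 3
      assert(hasShiftAddInsn(5, false, false, /*Xqciac=*/true));
    }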
+ for (auto *ADDIUser : User->users()) + if (!isRegRegScaleLoadOrStore(ADDIUser, SDValue(User, 0), Subtarget)) + return false; + } + + return true; +} + bool RISCVDAGToDAGISel::SelectAddrRegRegScale(SDValue Addr, unsigned MaxShiftAmount, SDValue &Base, SDValue &Index, @@ -3062,7 +3119,8 @@ bool RISCVDAGToDAGISel::SelectAddrRegRegScale(SDValue Addr, if (LHS.getOpcode() == ISD::ADD && !isa<ConstantSDNode>(LHS.getOperand(1)) && isInt<12>(C1->getSExtValue())) { - if (SelectShl(LHS.getOperand(1), Index, Scale)) { + if (SelectShl(LHS.getOperand(1), Index, Scale) && + isWorthFoldingIntoRegRegScale(*Subtarget, LHS, LHS.getOperand(1))) { SDValue C1Val = CurDAG->getTargetConstant(*C1->getConstantIntValue(), SDLoc(Addr), VT); Base = SDValue(CurDAG->getMachineNode(RISCV::ADDI, SDLoc(Addr), VT, @@ -3072,7 +3130,8 @@ bool RISCVDAGToDAGISel::SelectAddrRegRegScale(SDValue Addr, } // Add is commutative so we need to check both operands. - if (SelectShl(LHS.getOperand(0), Index, Scale)) { + if (SelectShl(LHS.getOperand(0), Index, Scale) && + isWorthFoldingIntoRegRegScale(*Subtarget, LHS, LHS.getOperand(0))) { SDValue C1Val = CurDAG->getTargetConstant(*C1->getConstantIntValue(), SDLoc(Addr), VT); Base = SDValue(CurDAG->getMachineNode(RISCV::ADDI, SDLoc(Addr), VT, @@ -3090,16 +3149,23 @@ bool RISCVDAGToDAGISel::SelectAddrRegRegScale(SDValue Addr, // Try to match a shift on the RHS. if (SelectShl(RHS, Index, Scale)) { + if (!isWorthFoldingIntoRegRegScale(*Subtarget, Addr, RHS)) + return false; Base = LHS; return true; } // Try to match a shift on the LHS. if (SelectShl(LHS, Index, Scale)) { + if (!isWorthFoldingIntoRegRegScale(*Subtarget, Addr, LHS)) + return false; Base = RHS; return true; } + if (!isWorthFoldingIntoRegRegScale(*Subtarget, Addr)) + return false; + Base = LHS; Index = RHS; Scale = CurDAG->getTargetConstant(0, SDLoc(Addr), VT); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 4845a9c..3918dd2 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -2319,6 +2319,10 @@ bool RISCVTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, if (getLegalZfaFPImm(Imm, VT) >= 0) return true; + // Some constants can be produced by fli+fneg. + if (Imm.isNegative() && getLegalZfaFPImm(-Imm, VT) >= 0) + return true; + // Cannot create a 64 bit floating-point immediate value for rv32. if (Subtarget.getXLen() < VT.getScalarSizeInBits()) { // td can handle +0.0 or -0.0 already. @@ -7936,7 +7940,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, BasePtr, MachinePointerInfo(Load->getAddressSpace()), Align(8)); OutChains.push_back(LoadVal.getValue(1)); Ret = DAG.getNode(RISCVISD::TUPLE_INSERT, DL, VT, Ret, LoadVal, - DAG.getVectorIdxConstant(i, DL)); + DAG.getTargetConstant(i, DL, MVT::i32)); BasePtr = DAG.getNode(ISD::ADD, DL, XLenVT, BasePtr, VROffset, Flag); } return DAG.getMergeValues( @@ -8015,9 +8019,10 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, // Extract subregisters in a vector tuple and store them individually. 
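The isFPImmLegal hunk earlier in this file also accepts an immediate whose negation is an fli-loadable constant, since an fli followed by fneg can materialize it. A standalone sketch of that rule under a simplifying assumption: the table below is a tiny illustrative subset of the Zfa fli constants, not the real getLegalZfaFPImm table.

    #include <cassert>
    #include <cmath>

    static bool isFliImm(double D) {
      const double Subset[] = {0.25, 0.5, 1.0, 2.0, 4.0}; // illustrative subset only
      for (double C : Subset)
        if (D == C)
          return true;
      return false;
    }

    // Legal if fli can load it directly, or if it is negative and fli+fneg can.
    static bool isFPImmLegalSketch(double D) {
      return isFliImm(D) || (std::signbit(D) && isFliImm(-D));
    }

    int main() {
      assert(isFPImmLegalSketch(0.5));   // fli
      assert(isFPImmLegalSketch(-0.5));  // fli 0.5 then fneg
      assert(!isFPImmLegalSketch(5.0));  // not in the illustrative subset either way
    }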
for (unsigned i = 0; i < NF; ++i) { - auto Extract = DAG.getNode(RISCVISD::TUPLE_EXTRACT, DL, - MVT::getScalableVectorVT(MVT::i8, NumElts), - StoredVal, DAG.getVectorIdxConstant(i, DL)); + auto Extract = + DAG.getNode(RISCVISD::TUPLE_EXTRACT, DL, + MVT::getScalableVectorVT(MVT::i8, NumElts), StoredVal, + DAG.getTargetConstant(i, DL, MVT::i32)); Ret = DAG.getStore(Chain, DL, Extract, BasePtr, MachinePointerInfo(Store->getAddressSpace()), Store->getBaseAlign(), @@ -10934,9 +10939,9 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Load->getMemoryVT(), Load->getMemOperand()); SmallVector<SDValue, 9> Results; for (unsigned int RetIdx = 0; RetIdx < NF; RetIdx++) { - SDValue SubVec = - DAG.getNode(RISCVISD::TUPLE_EXTRACT, DL, ContainerVT, - Result.getValue(0), DAG.getVectorIdxConstant(RetIdx, DL)); + SDValue SubVec = DAG.getNode(RISCVISD::TUPLE_EXTRACT, DL, ContainerVT, + Result.getValue(0), + DAG.getTargetConstant(RetIdx, DL, MVT::i32)); Results.push_back(convertFromScalableVector(VT, SubVec, DAG, Subtarget)); } Results.push_back(Result.getValue(1)); @@ -11023,7 +11028,7 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_VOID(SDValue Op, RISCVISD::TUPLE_INSERT, DL, VecTupTy, StoredVal, convertToScalableVector( ContainerVT, FixedIntrinsic->getOperand(2 + i), DAG, Subtarget), - DAG.getVectorIdxConstant(i, DL)); + DAG.getTargetConstant(i, DL, MVT::i32)); SDValue Ops[] = { FixedIntrinsic->getChain(), @@ -12027,7 +12032,7 @@ SDValue RISCVTargetLowering::lowerVECTOR_DEINTERLEAVE(SDValue Op, for (unsigned i = 0U; i < Factor; ++i) Res[i] = DAG.getNode(RISCVISD::TUPLE_EXTRACT, DL, VecVT, Load, - DAG.getVectorIdxConstant(i, DL)); + DAG.getTargetConstant(i, DL, MVT::i32)); return DAG.getMergeValues(Res, DL); } @@ -12124,8 +12129,9 @@ SDValue RISCVTargetLowering::lowerVECTOR_INTERLEAVE(SDValue Op, SDValue StoredVal = DAG.getUNDEF(VecTupTy); for (unsigned i = 0; i < Factor; i++) - StoredVal = DAG.getNode(RISCVISD::TUPLE_INSERT, DL, VecTupTy, StoredVal, - Op.getOperand(i), DAG.getConstant(i, DL, XLenVT)); + StoredVal = + DAG.getNode(RISCVISD::TUPLE_INSERT, DL, VecTupTy, StoredVal, + Op.getOperand(i), DAG.getTargetConstant(i, DL, MVT::i32)); SDValue Ops[] = {DAG.getEntryNode(), DAG.getTargetConstant(IntrIds[Factor - 2], DL, XLenVT), @@ -20690,7 +20696,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, SDValue Result = DAG.getUNDEF(VT); for (unsigned i = 0; i < NF; ++i) Result = DAG.getNode(RISCVISD::TUPLE_INSERT, DL, VT, Result, Splat, - DAG.getVectorIdxConstant(i, DL)); + DAG.getTargetConstant(i, DL, MVT::i32)); return Result; } // If this is a bitcast between a MVT::v4i1/v2i1/v1i1 and an illegal integer @@ -24014,7 +24020,7 @@ bool RISCVTargetLowering::splitValueIntoRegisterParts( #endif Val = DAG.getNode(RISCVISD::TUPLE_INSERT, DL, PartVT, DAG.getUNDEF(PartVT), - Val, DAG.getVectorIdxConstant(0, DL)); + Val, DAG.getTargetConstant(0, DL, MVT::i32)); Parts[0] = Val; return true; } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index e0a8c07..f0447e0 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -434,7 +434,8 @@ public: ArrayRef<unsigned> Indices, unsigned Factor) const override; - bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, + bool lowerInterleavedStore(Instruction *Store, Value *Mask, + ShuffleVectorInst *SVI, unsigned Factor) const override; bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask, @@ -444,9 +445,6 @@ public: Instruction 
*Store, Value *Mask, ArrayRef<Value *> InterleaveValues) const override; - bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask, - ArrayRef<Value *> InterleaveOps) const override; - bool supportKCFIBundles() const override { return true; } SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td index aef410f..dd365cf 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td @@ -44,67 +44,86 @@ def simm10_unsigned : RISCVOp { //===----------------------------------------------------------------------===// let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class RVPUnaryImm10<bits<7> funct7, string opcodestr, - DAGOperand TyImm10 = simm10> - : RVInstIBase<0b010, OPC_OP_IMM_32, (outs GPR:$rd), (ins TyImm10:$imm10), - opcodestr, "$rd, $imm10"> { +class PLI_i<bits<7> funct7, string opcodestr> + : RVInst<(outs GPR:$rd), (ins simm10:$imm10), opcodestr, "$rd, $imm10", [], + InstFormatOther> { bits<10> imm10; + bits<5> rd; let Inst{31-25} = funct7; let Inst{24-16} = imm10{8-0}; let Inst{15} = imm10{9}; + let Inst{14-12} = 0b010; + let Inst{11-7} = rd; + let Inst{6-0} = OPC_OP_IMM_32.Value; } let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class RVPUnaryImm8<bits<8> funct8, string opcodestr> - : RVInstIBase<0b010, OPC_OP_IMM_32, (outs GPR:$rd), (ins uimm8:$uimm8), - opcodestr, "$rd, $uimm8"> { +class PLUI_i<bits<7> funct7, string opcodestr> + : RVInst<(outs GPR:$rd), (ins simm10_unsigned:$imm10), opcodestr, + "$rd, $imm10", [], InstFormatOther> { + bits<10> imm10; + bits<5> rd; + + let Inst{31-25} = funct7; + let Inst{24} = imm10{0}; + let Inst{23-15} = imm10{9-1}; + let Inst{14-12} = 0b010; + let Inst{11-7} = rd; + let Inst{6-0} = OPC_OP_IMM_32.Value; +} + +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +class PLI_B_i<bits<8> funct8, string opcodestr> + : RVInst<(outs GPR:$rd), (ins uimm8:$uimm8), opcodestr, "$rd, $uimm8", [], + InstFormatOther> { bits<8> uimm8; + bits<5> rd; let Inst{31-24} = funct8; let Inst{23-16} = uimm8; let Inst{15} = 0b0; + let Inst{14-12} = 0b010; + let Inst{11-7} = rd; + let Inst{6-0} = OPC_OP_IMM_32.Value; } let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class RVPUnary<bits<3> f, string opcodestr, dag operands, string argstr> - : RVInstIBase<0b010, OPC_OP_IMM_32, (outs GPR:$rd), operands, opcodestr, argstr> { - bits<5> imm; - bits<5> rs1; - +class RVPShift_ri<bits<3> f, bits<3> funct3, string opcodestr, Operand ImmType> + : RVInstIBase<funct3, OPC_OP_IMM_32, (outs GPR:$rd), + (ins GPR:$rs1, ImmType:$shamt), opcodestr, + "$rd, $rs1, $shamt"> { let Inst{31} = 0b1; let Inst{30-28} = f; let Inst{27} = 0b0; - let Inst{19-15} = rs1; } -class RVPUnaryImm5<bits<3> f, string opcodestr> - : RVPUnary<f, opcodestr, (ins GPR:$rs1, uimm5:$uimm5), "$rd, $rs1, $uimm5"> { - bits<5> uimm5; +class RVPShiftW_ri<bits<3> f, bits<3> funct3, string opcodestr> + : RVPShift_ri<f, funct3, opcodestr, uimm5> { + bits<5> shamt; - let imm = uimm5; let Inst{26-25} = 0b01; - let Inst{24-20} = uimm5; + let Inst{24-20} = shamt; } -class RVPUnaryImm4<bits<3> f, string opcodestr> - : RVPUnary<f, opcodestr, (ins GPR:$rs1, uimm4:$uimm4), "$rd, $rs1, $uimm4"> { - bits<4> uimm4; +class RVPShiftH_ri<bits<3> f, bits<3> funct3, string opcodestr> + : RVPShift_ri<f, funct3, opcodestr, uimm4> { + bits<4> shamt; let Inst{26-24} = 0b001; - let Inst{23-20} = uimm4; + let Inst{23-20} = shamt; } -class RVPUnaryImm3<bits<3> f, string 
opcodestr> - : RVPUnary<f, opcodestr, (ins GPR:$rs1, uimm3:$uimm3), "$rd, $rs1, $uimm3"> { - bits<3> uimm3; +class RVPShiftB_ri<bits<3> f, bits<3> funct3, string opcodestr> + : RVPShift_ri<f, funct3, opcodestr, uimm3> { + bits<3> shamt; let Inst{26-23} = 0b0001; - let Inst{22-20} = uimm3; + let Inst{22-20} = shamt; } let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class RVPUnaryWUF<bits<2> w, bits<5> uf, string opcodestr> +class RVPUnary_ri<bits<2> w, bits<5> uf, string opcodestr> : RVInstIBase<0b010, OPC_OP_IMM_32, (outs GPR:$rd), (ins GPR:$rs1), opcodestr, "$rd, $rs1"> { let Inst{31-27} = 0b11100; @@ -132,36 +151,36 @@ def ABSW : UnaryW_r<0b011000000111, 0b001, "absw">; } // Predicates = [HasStdExtP, IsRV64] let Predicates = [HasStdExtP] in { -def PSLLI_B : RVPUnaryImm3<0b000, "pslli.b">; -def PSLLI_H : RVPUnaryImm4<0b000, "pslli.h">; -def PSSLAI_H : RVPUnaryImm4<0b101, "psslai.h">; +def PSLLI_B : RVPShiftB_ri<0b000, 0b010, "pslli.b">; +def PSLLI_H : RVPShiftH_ri<0b000, 0b010, "pslli.h">; +def PSSLAI_H : RVPShiftH_ri<0b101, 0b010, "psslai.h">; } // Predicates = [HasStdExtP] let DecoderNamespace = "RV32Only", Predicates = [HasStdExtP, IsRV32] in -def SSLAI : RVPUnaryImm5<0b101, "sslai">; +def SSLAI : RVPShiftW_ri<0b101, 0b010, "sslai">; let Predicates = [HasStdExtP, IsRV64] in { -def PSLLI_W : RVPUnaryImm5<0b000, "pslli.w">; -def PSSLAI_W : RVPUnaryImm5<0b101, "psslai.w">; +def PSLLI_W : RVPShiftW_ri<0b000, 0b010, "pslli.w">; +def PSSLAI_W : RVPShiftW_ri<0b101, 0b010, "psslai.w">; } // Predicates = [HasStdExtP, IsRV64] let Predicates = [HasStdExtP] in -def PLI_H : RVPUnaryImm10<0b1011000, "pli.h">; +def PLI_H : PLI_i<0b1011000, "pli.h">; let Predicates = [HasStdExtP, IsRV64] in -def PLI_W : RVPUnaryImm10<0b1011001, "pli.w">; +def PLI_W : PLI_i<0b1011001, "pli.w">; let Predicates = [HasStdExtP] in -def PLI_B : RVPUnaryImm8<0b10110100, "pli.b">; +def PLI_B : PLI_B_i<0b10110100, "pli.b">; let Predicates = [HasStdExtP] in { -def PSEXT_H_B : RVPUnaryWUF<0b00, 0b00100, "psext.h.b">; -def PSABS_H : RVPUnaryWUF<0b00, 0b00111, "psabs.h">; -def PSABS_B : RVPUnaryWUF<0b10, 0b00111, "psabs.b">; +def PSEXT_H_B : RVPUnary_ri<0b00, 0b00100, "psext.h.b">; +def PSABS_H : RVPUnary_ri<0b00, 0b00111, "psabs.h">; +def PSABS_B : RVPUnary_ri<0b10, 0b00111, "psabs.b">; } // Predicates = [HasStdExtP] let Predicates = [HasStdExtP, IsRV64] in { -def PSEXT_W_B : RVPUnaryWUF<0b01, 0b00100, "psext.w.b">; -def PSEXT_W_H : RVPUnaryWUF<0b01, 0b00101, "psext.w.h">; +def PSEXT_W_B : RVPUnary_ri<0b01, 0b00100, "psext.w.b">; +def PSEXT_W_H : RVPUnary_ri<0b01, 0b00101, "psext.w.h">; } // Predicates = [HasStdExtP, IsRV64] let Predicates = [HasStdExtP] in -def PLUI_H : RVPUnaryImm10<0b1111000, "plui.h", simm10_unsigned>; +def PLUI_H : PLUI_i<0b1111000, "plui.h">; let Predicates = [HasStdExtP, IsRV64] in -def PLUI_W : RVPUnaryImm10<0b1111001, "plui.w", simm10_unsigned>; +def PLUI_W : PLUI_i<0b1111001, "plui.w">; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index de9e55b..dfa532a 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -543,7 +543,8 @@ defset list<VTypeInfoToWide> AllWidenableBFloatToFloatVectors = { // This represents the information we need in codegen for each pseudo. // The definition should be consistent with `struct PseudoInfo` in // RISCVInstrInfo.h. 
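The PLUI_i class defined earlier in the RISCVInstrInfoP.td hunk splits its 10-bit immediate across Inst{24} (bit 0) and Inst{23-15} (bits 9..1). A standalone C++ packer that mirrors that layout as a sanity check; only the bit placement and the 0b010 funct3 come from the diff, the funct7/rd/immediate values in main are illustrative, and the opcode is assumed to be the standard OP-IMM-32 major opcode 0b0011011.

    #include <cassert>
    #include <cstdint>

    static uint32_t encodePLUI(uint32_t Funct7, uint32_t Imm10, uint32_t Rd) {
      uint32_t Inst = 0;
      Inst |= (Funct7 & 0x7F) << 25;        // Inst{31-25} = funct7
      Inst |= (Imm10 & 0x1) << 24;          // Inst{24}    = imm10{0}
      Inst |= ((Imm10 >> 1) & 0x1FF) << 15; // Inst{23-15} = imm10{9-1}
      Inst |= 0b010u << 12;                 // Inst{14-12} = funct3
      Inst |= (Rd & 0x1F) << 7;             // Inst{11-7}  = rd
      Inst |= 0b0011011u;                   // Inst{6-0}   = OP-IMM-32 (assumed)
      return Inst;
    }

    int main() {
      uint32_t I = encodePLUI(/*Funct7=*/0b1111000, /*Imm10=*/0x155, /*Rd=*/5);
      assert(((I >> 24) & 0x1) == (0x155u & 0x1));    // imm10{0} landed at bit 24
      assert(((I >> 15) & 0x1FF) == (0x155u >> 1));   // imm10{9-1} at bits 23..15
    }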
-class RISCVVPseudo { +class RISCVVPseudo<dag outs, dag ins, list<dag> pattern = [], string opcodestr = "", string argstr = ""> + : Pseudo<outs, ins, pattern, opcodestr, argstr> { Pseudo Pseudo = !cast<Pseudo>(NAME); // Used as a key. Instruction BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst); // SEW = 0 is used to denote that the Pseudo is not SEW specific (or unknown). @@ -785,10 +786,9 @@ class GetVTypeMinimalPredicates<VTypeInfo vti> { class VPseudoUSLoadNoMask<VReg RetClass, int EEW, DAGOperand sewop = sew> : - Pseudo<(outs RetClass:$rd), - (ins RetClass:$dest, GPRMemZeroOffset:$rs1, AVL:$vl, sewop:$sew, - vec_policy:$policy), []>, - RISCVVPseudo, + RISCVVPseudo<(outs RetClass:$rd), + (ins RetClass:$dest, GPRMemZeroOffset:$rs1, AVL:$vl, + sewop:$sew, vec_policy:$policy), []>, RISCVVLE</*Masked*/0, /*Strided*/0, /*FF*/0, !logtwo(EEW), VLMul> { let mayLoad = 1; let mayStore = 0; @@ -801,11 +801,10 @@ class VPseudoUSLoadNoMask<VReg RetClass, class VPseudoUSLoadMask<VReg RetClass, int EEW> : - Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd), - (ins GetVRegNoV0<RetClass>.R:$passthru, - GPRMemZeroOffset:$rs1, - VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), []>, - RISCVVPseudo, + RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd), + (ins GetVRegNoV0<RetClass>.R:$passthru, + GPRMemZeroOffset:$rs1, VMaskOp:$vm, AVL:$vl, sew:$sew, + vec_policy:$policy), []>, RISCVVLE</*Masked*/1, /*Strided*/0, /*FF*/0, !logtwo(EEW), VLMul> { let mayLoad = 1; let mayStore = 0; @@ -820,10 +819,9 @@ class VPseudoUSLoadMask<VReg RetClass, class VPseudoUSLoadFFNoMask<VReg RetClass, int EEW> : - Pseudo<(outs RetClass:$rd, GPR:$vl), - (ins RetClass:$dest, GPRMemZeroOffset:$rs1, AVL:$avl, - sew:$sew, vec_policy:$policy), []>, - RISCVVPseudo, + RISCVVPseudo<(outs RetClass:$rd, GPR:$vl), + (ins RetClass:$dest, GPRMemZeroOffset:$rs1, AVL:$avl, + sew:$sew, vec_policy:$policy), []>, RISCVVLE</*Masked*/0, /*Strided*/0, /*FF*/1, !logtwo(EEW), VLMul> { let mayLoad = 1; let mayStore = 0; @@ -836,11 +834,10 @@ class VPseudoUSLoadFFNoMask<VReg RetClass, class VPseudoUSLoadFFMask<VReg RetClass, int EEW> : - Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd, GPR:$vl), - (ins GetVRegNoV0<RetClass>.R:$passthru, - GPRMemZeroOffset:$rs1, - VMaskOp:$vm, AVL:$avl, sew:$sew, vec_policy:$policy), []>, - RISCVVPseudo, + RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd, GPR:$vl), + (ins GetVRegNoV0<RetClass>.R:$passthru, + GPRMemZeroOffset:$rs1, VMaskOp:$vm, AVL:$avl, sew:$sew, + vec_policy:$policy), []>, RISCVVLE</*Masked*/1, /*Strided*/0, /*FF*/1, !logtwo(EEW), VLMul> { let mayLoad = 1; let mayStore = 0; @@ -855,10 +852,9 @@ class VPseudoUSLoadFFMask<VReg RetClass, class VPseudoSLoadNoMask<VReg RetClass, int EEW> : - Pseudo<(outs RetClass:$rd), - (ins RetClass:$dest, GPRMemZeroOffset:$rs1, GPR:$rs2, AVL:$vl, - sew:$sew, vec_policy:$policy), []>, - RISCVVPseudo, + RISCVVPseudo<(outs RetClass:$rd), + (ins RetClass:$dest, GPRMemZeroOffset:$rs1, GPR:$rs2, + AVL:$vl, sew:$sew, vec_policy:$policy), []>, RISCVVLE</*Masked*/0, /*Strided*/1, /*FF*/0, !logtwo(EEW), VLMul> { let mayLoad = 1; let mayStore = 0; @@ -871,11 +867,10 @@ class VPseudoSLoadNoMask<VReg RetClass, class VPseudoSLoadMask<VReg RetClass, int EEW> : - Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd), - (ins GetVRegNoV0<RetClass>.R:$passthru, - GPRMemZeroOffset:$rs1, GPR:$rs2, - VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), []>, - RISCVVPseudo, + RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd), + (ins GetVRegNoV0<RetClass>.R:$passthru, + GPRMemZeroOffset:$rs1, GPR:$rs2, VMaskOp:$vm, 
AVL:$vl, + sew:$sew, vec_policy:$policy), []>, RISCVVLE</*Masked*/1, /*Strided*/1, /*FF*/0, !logtwo(EEW), VLMul> { let mayLoad = 1; let mayStore = 0; @@ -895,10 +890,9 @@ class VPseudoILoadNoMask<VReg RetClass, bit Ordered, bit EarlyClobber, bits<2> TargetConstraintType = 1> : - Pseudo<(outs RetClass:$rd), - (ins RetClass:$dest, GPRMemZeroOffset:$rs1, IdxClass:$rs2, AVL:$vl, - sew:$sew, vec_policy:$policy), []>, - RISCVVPseudo, + RISCVVPseudo<(outs RetClass:$rd), + (ins RetClass:$dest, GPRMemZeroOffset:$rs1, IdxClass:$rs2, + AVL:$vl, sew:$sew, vec_policy:$policy), []>, RISCVVLX</*Masked*/0, Ordered, !logtwo(EEW), VLMul, LMUL> { let mayLoad = 1; let mayStore = 0; @@ -917,11 +911,10 @@ class VPseudoILoadMask<VReg RetClass, bit Ordered, bit EarlyClobber, bits<2> TargetConstraintType = 1> : - Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd), - (ins GetVRegNoV0<RetClass>.R:$passthru, - GPRMemZeroOffset:$rs1, IdxClass:$rs2, - VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), []>, - RISCVVPseudo, + RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd), + (ins GetVRegNoV0<RetClass>.R:$passthru, + GPRMemZeroOffset:$rs1, IdxClass:$rs2, VMaskOp:$vm, + AVL:$vl, sew:$sew, vec_policy:$policy), []>, RISCVVLX</*Masked*/1, Ordered, !logtwo(EEW), VLMul, LMUL> { let mayLoad = 1; let mayStore = 0; @@ -938,9 +931,9 @@ class VPseudoILoadMask<VReg RetClass, class VPseudoUSStoreNoMask<VReg StClass, int EEW, DAGOperand sewop = sew> : - Pseudo<(outs), - (ins StClass:$rd, GPRMemZeroOffset:$rs1, AVL:$vl, sewop:$sew), []>, - RISCVVPseudo, + RISCVVPseudo<(outs), + (ins StClass:$rd, GPRMemZeroOffset:$rs1, AVL:$vl, + sewop:$sew), []>, RISCVVSE</*Masked*/0, /*Strided*/0, !logtwo(EEW), VLMul> { let mayLoad = 0; let mayStore = 1; @@ -951,10 +944,9 @@ class VPseudoUSStoreNoMask<VReg StClass, class VPseudoUSStoreMask<VReg StClass, int EEW> : - Pseudo<(outs), - (ins StClass:$rd, GPRMemZeroOffset:$rs1, - VMaskOp:$vm, AVL:$vl, sew:$sew), []>, - RISCVVPseudo, + RISCVVPseudo<(outs), + (ins StClass:$rd, GPRMemZeroOffset:$rs1, + VMaskOp:$vm, AVL:$vl, sew:$sew), []>, RISCVVSE</*Masked*/1, /*Strided*/0, !logtwo(EEW), VLMul> { let mayLoad = 0; let mayStore = 1; @@ -966,10 +958,9 @@ class VPseudoUSStoreMask<VReg StClass, class VPseudoSStoreNoMask<VReg StClass, int EEW> : - Pseudo<(outs), - (ins StClass:$rd, GPRMemZeroOffset:$rs1, GPR:$rs2, - AVL:$vl, sew:$sew), []>, - RISCVVPseudo, + RISCVVPseudo<(outs), + (ins StClass:$rd, GPRMemZeroOffset:$rs1, GPR:$rs2, + AVL:$vl, sew:$sew), []>, RISCVVSE</*Masked*/0, /*Strided*/1, !logtwo(EEW), VLMul> { let mayLoad = 0; let mayStore = 1; @@ -980,10 +971,9 @@ class VPseudoSStoreNoMask<VReg StClass, class VPseudoSStoreMask<VReg StClass, int EEW> : - Pseudo<(outs), - (ins StClass:$rd, GPRMemZeroOffset:$rs1, GPR:$rs2, - VMaskOp:$vm, AVL:$vl, sew:$sew), []>, - RISCVVPseudo, + RISCVVPseudo<(outs), + (ins StClass:$rd, GPRMemZeroOffset:$rs1, GPR:$rs2, + VMaskOp:$vm, AVL:$vl, sew:$sew), []>, RISCVVSE</*Masked*/1, /*Strided*/1, !logtwo(EEW), VLMul> { let mayLoad = 0; let mayStore = 1; @@ -994,10 +984,9 @@ class VPseudoSStoreMask<VReg StClass, } class VPseudoNullaryNoMask<VReg RegClass> : - Pseudo<(outs RegClass:$rd), - (ins RegClass:$passthru, - AVL:$vl, sew:$sew, vec_policy:$policy), []>, - RISCVVPseudo { + RISCVVPseudo<(outs RegClass:$rd), + (ins RegClass:$passthru, + AVL:$vl, sew:$sew, vec_policy:$policy), []> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1008,10 +997,10 @@ class VPseudoNullaryNoMask<VReg RegClass> : } class VPseudoNullaryMask<VReg RegClass> : - Pseudo<(outs 
GetVRegNoV0<RegClass>.R:$rd), - (ins GetVRegNoV0<RegClass>.R:$passthru, - VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), []>, - RISCVVPseudo { + RISCVVPseudo<(outs GetVRegNoV0<RegClass>.R:$rd), + (ins GetVRegNoV0<RegClass>.R:$passthru, + VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), + []> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1026,8 +1015,7 @@ class VPseudoNullaryMask<VReg RegClass> : // Nullary for pseudo instructions. They are expanded in // RISCVExpandPseudoInsts pass. class VPseudoNullaryPseudoM<string BaseInst> : - Pseudo<(outs VR:$rd), (ins AVL:$vl, sew_mask:$sew), []>, - RISCVVPseudo { + RISCVVPseudo<(outs VR:$rd), (ins AVL:$vl, sew_mask:$sew), []> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1041,10 +1029,9 @@ class VPseudoUnaryNoMask<DAGOperand RetClass, DAGOperand OpClass, string Constraint = "", bits<2> TargetConstraintType = 1> : - Pseudo<(outs RetClass:$rd), - (ins RetClass:$passthru, OpClass:$rs2, - AVL:$vl, sew:$sew, vec_policy:$policy), []>, - RISCVVPseudo { + RISCVVPseudo<(outs RetClass:$rd), + (ins RetClass:$passthru, OpClass:$rs2, + AVL:$vl, sew:$sew, vec_policy:$policy), []> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1059,9 +1046,8 @@ class VPseudoUnaryNoMaskNoPolicy<DAGOperand RetClass, DAGOperand OpClass, string Constraint = "", bits<2> TargetConstraintType = 1> : - Pseudo<(outs RetClass:$rd), - (ins OpClass:$rs2, AVL:$vl, sew_mask:$sew), []>, - RISCVVPseudo { + RISCVVPseudo<(outs RetClass:$rd), + (ins OpClass:$rs2, AVL:$vl, sew_mask:$sew), []> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1075,10 +1061,9 @@ class VPseudoUnaryNoMaskRoundingMode<DAGOperand RetClass, DAGOperand OpClass, string Constraint = "", bits<2> TargetConstraintType = 1> : - Pseudo<(outs RetClass:$rd), - (ins RetClass:$passthru, OpClass:$rs2, vec_rm:$rm, - AVL:$vl, sew:$sew, vec_policy:$policy), []>, - RISCVVPseudo { + RISCVVPseudo<(outs RetClass:$rd), + (ins RetClass:$passthru, OpClass:$rs2, vec_rm:$rm, + AVL:$vl, sew:$sew, vec_policy:$policy), []> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1097,10 +1082,9 @@ class VPseudoUnaryMask<VReg RetClass, string Constraint = "", bits<2> TargetConstraintType = 1, DAGOperand sewop = sew> : - Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd), - (ins GetVRegNoV0<RetClass>.R:$passthru, OpClass:$rs2, - VMaskOp:$vm, AVL:$vl, sewop:$sew, vec_policy:$policy), []>, - RISCVVPseudo { + RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd), + (ins GetVRegNoV0<RetClass>.R:$passthru, OpClass:$rs2, + VMaskOp:$vm, AVL:$vl, sewop:$sew, vec_policy:$policy), []> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1117,11 +1101,10 @@ class VPseudoUnaryMaskRoundingMode<VReg RetClass, VReg OpClass, string Constraint = "", bits<2> TargetConstraintType = 1> : - Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd), - (ins GetVRegNoV0<RetClass>.R:$passthru, OpClass:$rs2, - VMaskOp:$vm, vec_rm:$rm, - AVL:$vl, sew:$sew, vec_policy:$policy), []>, - RISCVVPseudo { + RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd), + (ins GetVRegNoV0<RetClass>.R:$passthru, OpClass:$rs2, + VMaskOp:$vm, vec_rm:$rm, + AVL:$vl, sew:$sew, vec_policy:$policy), []> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1155,9 +1138,8 @@ class VPseudoUnaryMask_NoExcept<VReg RetClass, } class VPseudoUnaryNoMaskGPROut : - Pseudo<(outs GPR:$rd), - (ins VR:$rs2, AVL:$vl, sew_mask:$sew), []>, - RISCVVPseudo { + RISCVVPseudo<(outs GPR:$rd), + (ins VR:$rs2, AVL:$vl, sew_mask:$sew), []> { let mayLoad 
= 0; let mayStore = 0; let hasSideEffects = 0; @@ -1166,9 +1148,8 @@ class VPseudoUnaryNoMaskGPROut : } class VPseudoUnaryMaskGPROut : - Pseudo<(outs GPR:$rd), - (ins VR:$rs1, VMaskOp:$vm, AVL:$vl, sew_mask:$sew), []>, - RISCVVPseudo { + RISCVVPseudo<(outs GPR:$rd), + (ins VR:$rs1, VMaskOp:$vm, AVL:$vl, sew_mask:$sew), []> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1180,10 +1161,9 @@ class VPseudoUnaryMaskGPROut : // Mask can be V0~V31 class VPseudoUnaryAnyMask<VReg RetClass, VReg Op1Class> : - Pseudo<(outs RetClass:$rd), - (ins RetClass:$passthru, Op1Class:$rs2, - VR:$vm, AVL:$vl, sew:$sew), []>, - RISCVVPseudo { + RISCVVPseudo<(outs RetClass:$rd), + (ins RetClass:$passthru, Op1Class:$rs2, + VR:$vm, AVL:$vl, sew:$sew), []> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1198,9 +1178,9 @@ class VPseudoBinaryNoMask<VReg RetClass, string Constraint, bits<2> TargetConstraintType = 1, DAGOperand sewop = sew> : - Pseudo<(outs RetClass:$rd), - (ins Op1Class:$rs2, Op2Class:$rs1, AVL:$vl, sewop:$sew), []>, - RISCVVPseudo { + RISCVVPseudo<(outs RetClass:$rd), + (ins Op1Class:$rs2, Op2Class:$rs1, AVL:$vl, sewop:$sew), + []> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1215,10 +1195,9 @@ class VPseudoBinaryNoMaskPolicy<VReg RetClass, DAGOperand Op2Class, string Constraint, bits<2> TargetConstraintType = 1> : - Pseudo<(outs RetClass:$rd), - (ins RetClass:$passthru, Op1Class:$rs2, Op2Class:$rs1, AVL:$vl, - sew:$sew, vec_policy:$policy), []>, - RISCVVPseudo { + RISCVVPseudo<(outs RetClass:$rd), + (ins RetClass:$passthru, Op1Class:$rs2, Op2Class:$rs1, + AVL:$vl, sew:$sew, vec_policy:$policy), []> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1235,10 +1214,10 @@ class VPseudoBinaryNoMaskRoundingMode<VReg RetClass, string Constraint, bit UsesVXRM_ = 1, bits<2> TargetConstraintType = 1> : - Pseudo<(outs RetClass:$rd), - (ins RetClass:$passthru, Op1Class:$rs2, Op2Class:$rs1, vec_rm:$rm, - AVL:$vl, sew:$sew, vec_policy:$policy), []>, - RISCVVPseudo { + RISCVVPseudo<(outs RetClass:$rd), + (ins RetClass:$passthru, Op1Class:$rs2, Op2Class:$rs1, + vec_rm:$rm, AVL:$vl, sew:$sew, vec_policy:$policy), + []> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1258,12 +1237,11 @@ class VPseudoBinaryMaskPolicyRoundingMode<VReg RetClass, string Constraint, bit UsesVXRM_, bits<2> TargetConstraintType = 1> : - Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd), - (ins GetVRegNoV0<RetClass>.R:$passthru, - Op1Class:$rs2, Op2Class:$rs1, - VMaskOp:$vm, vec_rm:$rm, AVL:$vl, - sew:$sew, vec_policy:$policy), []>, - RISCVVPseudo { + RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd), + (ins GetVRegNoV0<RetClass>.R:$passthru, + Op1Class:$rs2, Op2Class:$rs1, + VMaskOp:$vm, vec_rm:$rm, AVL:$vl, + sew:$sew, vec_policy:$policy), []> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1286,10 +1264,9 @@ class VPseudoTiedBinaryNoMask<VReg RetClass, DAGOperand Op2Class, string Constraint, bits<2> TargetConstraintType = 1> : - Pseudo<(outs RetClass:$rd), - (ins RetClass:$rs2, Op2Class:$rs1, AVL:$vl, sew:$sew, - vec_policy:$policy), []>, - RISCVVPseudo { + RISCVVPseudo<(outs RetClass:$rd), + (ins RetClass:$rs2, Op2Class:$rs1, AVL:$vl, sew:$sew, + vec_policy:$policy), []> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1307,12 +1284,11 @@ class VPseudoTiedBinaryNoMaskRoundingMode<VReg RetClass, DAGOperand Op2Class, string Constraint, bits<2> TargetConstraintType = 1> : - Pseudo<(outs RetClass:$rd), - (ins RetClass:$rs2, 
Op2Class:$rs1, - vec_rm:$rm, - AVL:$vl, sew:$sew, - vec_policy:$policy), []>, - RISCVVPseudo { + RISCVVPseudo<(outs RetClass:$rd), + (ins RetClass:$rs2, Op2Class:$rs1, + vec_rm:$rm, + AVL:$vl, sew:$sew, + vec_policy:$policy), []> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1331,10 +1307,9 @@ class VPseudoTiedBinaryNoMaskRoundingMode<VReg RetClass, class VPseudoIStoreNoMask<VReg StClass, VReg IdxClass, int EEW, bits<3> LMUL, bit Ordered>: - Pseudo<(outs), - (ins StClass:$rd, GPRMemZeroOffset:$rs1, IdxClass:$rs2, AVL:$vl, - sew:$sew),[]>, - RISCVVPseudo, + RISCVVPseudo<(outs), + (ins StClass:$rd, GPRMemZeroOffset:$rs1, IdxClass:$rs2, + AVL:$vl, sew:$sew),[]>, RISCVVSX</*Masked*/0, Ordered, !logtwo(EEW), VLMul, LMUL> { let mayLoad = 0; let mayStore = 1; @@ -1345,10 +1320,9 @@ class VPseudoIStoreNoMask<VReg StClass, VReg IdxClass, int EEW, bits<3> LMUL, class VPseudoIStoreMask<VReg StClass, VReg IdxClass, int EEW, bits<3> LMUL, bit Ordered>: - Pseudo<(outs), - (ins StClass:$rd, GPRMemZeroOffset:$rs1, IdxClass:$rs2, - VMaskOp:$vm, AVL:$vl, sew:$sew),[]>, - RISCVVPseudo, + RISCVVPseudo<(outs), + (ins StClass:$rd, GPRMemZeroOffset:$rs1, IdxClass:$rs2, + VMaskOp:$vm, AVL:$vl, sew:$sew),[]>, RISCVVSX</*Masked*/1, Ordered, !logtwo(EEW), VLMul, LMUL> { let mayLoad = 0; let mayStore = 1; @@ -1363,11 +1337,11 @@ class VPseudoBinaryMaskPolicy<VReg RetClass, DAGOperand Op2Class, string Constraint, bits<2> TargetConstraintType = 1> : - Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd), - (ins GetVRegNoV0<RetClass>.R:$passthru, - Op1Class:$rs2, Op2Class:$rs1, - VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), []>, - RISCVVPseudo { + RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd), + (ins GetVRegNoV0<RetClass>.R:$passthru, + Op1Class:$rs2, Op2Class:$rs1, + VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), + []> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1383,11 +1357,11 @@ class VPseudoBinaryMaskPolicy<VReg RetClass, class VPseudoTernaryMaskPolicy<VReg RetClass, RegisterClass Op1Class, DAGOperand Op2Class> : - Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd), - (ins GetVRegNoV0<RetClass>.R:$passthru, - Op1Class:$rs2, Op2Class:$rs1, - VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), []>, - RISCVVPseudo { + RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd), + (ins GetVRegNoV0<RetClass>.R:$passthru, + Op1Class:$rs2, Op2Class:$rs1, + VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), + []> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1401,13 +1375,12 @@ class VPseudoTernaryMaskPolicy<VReg RetClass, class VPseudoTernaryMaskPolicyRoundingMode<VReg RetClass, RegisterClass Op1Class, DAGOperand Op2Class> : - Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd), - (ins GetVRegNoV0<RetClass>.R:$passthru, - Op1Class:$rs2, Op2Class:$rs1, - VMaskOp:$vm, - vec_rm:$rm, - AVL:$vl, sew:$sew, vec_policy:$policy), []>, - RISCVVPseudo { + RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd), + (ins GetVRegNoV0<RetClass>.R:$passthru, + Op1Class:$rs2, Op2Class:$rs1, + VMaskOp:$vm, + vec_rm:$rm, + AVL:$vl, sew:$sew, vec_policy:$policy), []> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1427,11 +1400,11 @@ class VPseudoBinaryMOutMask<VReg RetClass, DAGOperand Op2Class, string Constraint, bits<2> TargetConstraintType = 1> : - Pseudo<(outs RetClass:$rd), - (ins RetClass:$passthru, - Op1Class:$rs2, Op2Class:$rs1, - VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), []>, - RISCVVPseudo { + RISCVVPseudo<(outs RetClass:$rd), + (ins RetClass:$passthru, + Op1Class:$rs2, 
Op2Class:$rs1, + VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), + []> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1451,11 +1424,11 @@ class VPseudoTiedBinaryMask<VReg RetClass, DAGOperand Op2Class, string Constraint, bits<2> TargetConstraintType = 1> : - Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd), - (ins GetVRegNoV0<RetClass>.R:$passthru, - Op2Class:$rs1, - VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), []>, - RISCVVPseudo { + RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd), + (ins GetVRegNoV0<RetClass>.R:$passthru, + Op2Class:$rs1, + VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), + []> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1473,13 +1446,12 @@ class VPseudoTiedBinaryMaskRoundingMode<VReg RetClass, DAGOperand Op2Class, string Constraint, bits<2> TargetConstraintType = 1> : - Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd), - (ins GetVRegNoV0<RetClass>.R:$passthru, - Op2Class:$rs1, - VMaskOp:$vm, - vec_rm:$rm, - AVL:$vl, sew:$sew, vec_policy:$policy), []>, - RISCVVPseudo { + RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd), + (ins GetVRegNoV0<RetClass>.R:$passthru, + Op2Class:$rs1, + VMaskOp:$vm, + vec_rm:$rm, + AVL:$vl, sew:$sew, vec_policy:$policy), []> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1503,13 +1475,12 @@ class VPseudoBinaryCarry<VReg RetClass, bit CarryIn, string Constraint, bits<2> TargetConstraintType = 1> : - Pseudo<(outs RetClass:$rd), - !if(CarryIn, - (ins Op1Class:$rs2, Op2Class:$rs1, - VMV0:$carry, AVL:$vl, sew:$sew), - (ins Op1Class:$rs2, Op2Class:$rs1, - AVL:$vl, sew:$sew)), []>, - RISCVVPseudo { + RISCVVPseudo<(outs RetClass:$rd), + !if(CarryIn, + (ins Op1Class:$rs2, Op2Class:$rs1, + VMV0:$carry, AVL:$vl, sew:$sew), + (ins Op1Class:$rs2, Op2Class:$rs1, + AVL:$vl, sew:$sew)), []> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1525,10 +1496,9 @@ class VPseudoTiedBinaryCarryIn<VReg RetClass, DAGOperand Op2Class, LMULInfo MInfo, bits<2> TargetConstraintType = 1> : - Pseudo<(outs RetClass:$rd), - (ins RetClass:$passthru, Op1Class:$rs2, Op2Class:$rs1, - VMV0:$carry, AVL:$vl, sew:$sew), []>, - RISCVVPseudo { + RISCVVPseudo<(outs RetClass:$rd), + (ins RetClass:$passthru, Op1Class:$rs2, Op2Class:$rs1, + VMV0:$carry, AVL:$vl, sew:$sew), []> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1544,10 +1514,9 @@ class VPseudoTernaryNoMask<VReg RetClass, RegisterClass Op1Class, DAGOperand Op2Class, string Constraint> : - Pseudo<(outs RetClass:$rd), - (ins RetClass:$rs3, Op1Class:$rs1, Op2Class:$rs2, - AVL:$vl, sew:$sew), []>, - RISCVVPseudo { + RISCVVPseudo<(outs RetClass:$rd), + (ins RetClass:$rs3, Op1Class:$rs1, Op2Class:$rs2, + AVL:$vl, sew:$sew), []> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1561,10 +1530,9 @@ class VPseudoTernaryNoMaskWithPolicy<VReg RetClass, DAGOperand Op2Class, string Constraint = "", bits<2> TargetConstraintType = 1> : - Pseudo<(outs RetClass:$rd), - (ins RetClass:$rs3, Op1Class:$rs1, Op2Class:$rs2, - AVL:$vl, sew:$sew, vec_policy:$policy), []>, - RISCVVPseudo { + RISCVVPseudo<(outs RetClass:$rd), + (ins RetClass:$rs3, Op1Class:$rs1, Op2Class:$rs2, + AVL:$vl, sew:$sew, vec_policy:$policy), []> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1580,10 +1548,10 @@ class VPseudoTernaryNoMaskWithPolicyRoundingMode<VReg RetClass, DAGOperand Op2Class, string Constraint = "", bits<2> TargetConstraintType = 1> : - Pseudo<(outs RetClass:$rd), - (ins RetClass:$rs3, Op1Class:$rs1, Op2Class:$rs2, - vec_rm:$rm, AVL:$vl, 
sew:$sew, vec_policy:$policy), []>, - RISCVVPseudo { + RISCVVPseudo<(outs RetClass:$rd), + (ins RetClass:$rs3, Op1Class:$rs1, Op2Class:$rs2, + vec_rm:$rm, AVL:$vl, sew:$sew, vec_policy:$policy), + []> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1600,10 +1568,9 @@ class VPseudoTernaryNoMaskWithPolicyRoundingMode<VReg RetClass, class VPseudoUSSegLoadNoMask<VReg RetClass, int EEW, bits<4> NF> : - Pseudo<(outs RetClass:$rd), - (ins RetClass:$dest, GPRMemZeroOffset:$rs1, AVL:$vl, - sew:$sew, vec_policy:$policy), []>, - RISCVVPseudo, + RISCVVPseudo<(outs RetClass:$rd), + (ins RetClass:$dest, GPRMemZeroOffset:$rs1, AVL:$vl, + sew:$sew, vec_policy:$policy), []>, RISCVVLSEG<NF, /*Masked*/0, /*Strided*/0, /*FF*/0, !logtwo(EEW), VLMul> { let mayLoad = 1; let mayStore = 0; @@ -1617,10 +1584,10 @@ class VPseudoUSSegLoadNoMask<VReg RetClass, class VPseudoUSSegLoadMask<VReg RetClass, int EEW, bits<4> NF> : - Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd), - (ins GetVRegNoV0<RetClass>.R:$passthru, GPRMemZeroOffset:$rs1, - VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), []>, - RISCVVPseudo, + RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd), + (ins GetVRegNoV0<RetClass>.R:$passthru, + GPRMemZeroOffset:$rs1, VMaskOp:$vm, AVL:$vl, sew:$sew, + vec_policy:$policy), []>, RISCVVLSEG<NF, /*Masked*/1, /*Strided*/0, /*FF*/0, !logtwo(EEW), VLMul> { let mayLoad = 1; let mayStore = 0; @@ -1636,10 +1603,9 @@ class VPseudoUSSegLoadMask<VReg RetClass, class VPseudoUSSegLoadFFNoMask<VReg RetClass, int EEW, bits<4> NF> : - Pseudo<(outs RetClass:$rd, GPR:$vl), - (ins RetClass:$dest, GPRMemZeroOffset:$rs1, AVL:$avl, - sew:$sew, vec_policy:$policy), []>, - RISCVVPseudo, + RISCVVPseudo<(outs RetClass:$rd, GPR:$vl), + (ins RetClass:$dest, GPRMemZeroOffset:$rs1, AVL:$avl, + sew:$sew, vec_policy:$policy), []>, RISCVVLSEG<NF, /*Masked*/0, /*Strided*/0, /*FF*/1, !logtwo(EEW), VLMul> { let mayLoad = 1; let mayStore = 0; @@ -1653,10 +1619,10 @@ class VPseudoUSSegLoadFFNoMask<VReg RetClass, class VPseudoUSSegLoadFFMask<VReg RetClass, int EEW, bits<4> NF> : - Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd, GPR:$vl), - (ins GetVRegNoV0<RetClass>.R:$passthru, GPRMemZeroOffset:$rs1, - VMaskOp:$vm, AVL:$avl, sew:$sew, vec_policy:$policy), []>, - RISCVVPseudo, + RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd, GPR:$vl), + (ins GetVRegNoV0<RetClass>.R:$passthru, + GPRMemZeroOffset:$rs1, VMaskOp:$vm, AVL:$avl, sew:$sew, + vec_policy:$policy), []>, RISCVVLSEG<NF, /*Masked*/1, /*Strided*/0, /*FF*/1, !logtwo(EEW), VLMul> { let mayLoad = 1; let mayStore = 0; @@ -1672,10 +1638,9 @@ class VPseudoUSSegLoadFFMask<VReg RetClass, class VPseudoSSegLoadNoMask<VReg RetClass, int EEW, bits<4> NF> : - Pseudo<(outs RetClass:$rd), - (ins RetClass:$passthru, GPRMemZeroOffset:$rs1, GPR:$offset, AVL:$vl, - sew:$sew, vec_policy:$policy), []>, - RISCVVPseudo, + RISCVVPseudo<(outs RetClass:$rd), + (ins RetClass:$passthru, GPRMemZeroOffset:$rs1, GPR:$offset, + AVL:$vl, sew:$sew, vec_policy:$policy), []>, RISCVVLSEG<NF, /*Masked*/0, /*Strided*/1, /*FF*/0, !logtwo(EEW), VLMul> { let mayLoad = 1; let mayStore = 0; @@ -1689,11 +1654,10 @@ class VPseudoSSegLoadNoMask<VReg RetClass, class VPseudoSSegLoadMask<VReg RetClass, int EEW, bits<4> NF> : - Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd), - (ins GetVRegNoV0<RetClass>.R:$passthru, GPRMemZeroOffset:$rs1, - GPR:$offset, VMaskOp:$vm, AVL:$vl, sew:$sew, - vec_policy:$policy), []>, - RISCVVPseudo, + RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd), + (ins GetVRegNoV0<RetClass>.R:$passthru, + GPRMemZeroOffset:$rs1, 
GPR:$offset, VMaskOp:$vm, + AVL:$vl, sew:$sew, vec_policy:$policy), []>, RISCVVLSEG<NF, /*Masked*/1, /*Strided*/1, /*FF*/0, !logtwo(EEW), VLMul> { let mayLoad = 1; let mayStore = 0; @@ -1712,10 +1676,10 @@ class VPseudoISegLoadNoMask<VReg RetClass, bits<3> LMUL, bits<4> NF, bit Ordered> : - Pseudo<(outs RetClass:$rd), - (ins RetClass:$passthru, GPRMemZeroOffset:$rs1, IdxClass:$offset, AVL:$vl, - sew:$sew, vec_policy:$policy), []>, - RISCVVPseudo, + RISCVVPseudo<(outs RetClass:$rd), + (ins RetClass:$passthru, GPRMemZeroOffset:$rs1, + IdxClass:$offset, AVL:$vl, sew:$sew, + vec_policy:$policy), []>, RISCVVLXSEG<NF, /*Masked*/0, Ordered, !logtwo(EEW), VLMul, LMUL> { let mayLoad = 1; let mayStore = 0; @@ -1734,11 +1698,10 @@ class VPseudoISegLoadMask<VReg RetClass, bits<3> LMUL, bits<4> NF, bit Ordered> : - Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd), - (ins GetVRegNoV0<RetClass>.R:$passthru, GPRMemZeroOffset:$rs1, - IdxClass:$offset, VMaskOp:$vm, AVL:$vl, sew:$sew, - vec_policy:$policy), []>, - RISCVVPseudo, + RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd), + (ins GetVRegNoV0<RetClass>.R:$passthru, + GPRMemZeroOffset:$rs1, IdxClass:$offset, VMaskOp:$vm, + AVL:$vl, sew:$sew, vec_policy:$policy), []>, RISCVVLXSEG<NF, /*Masked*/1, Ordered, !logtwo(EEW), VLMul, LMUL> { let mayLoad = 1; let mayStore = 0; @@ -1756,9 +1719,9 @@ class VPseudoISegLoadMask<VReg RetClass, class VPseudoUSSegStoreNoMask<VReg ValClass, int EEW, bits<4> NF> : - Pseudo<(outs), - (ins ValClass:$rd, GPRMemZeroOffset:$rs1, AVL:$vl, sew:$sew), []>, - RISCVVPseudo, + RISCVVPseudo<(outs), + (ins ValClass:$rd, GPRMemZeroOffset:$rs1, AVL:$vl, sew:$sew), + []>, RISCVVSSEG<NF, /*Masked*/0, /*Strided*/0, !logtwo(EEW), VLMul> { let mayLoad = 0; let mayStore = 1; @@ -1770,10 +1733,9 @@ class VPseudoUSSegStoreNoMask<VReg ValClass, class VPseudoUSSegStoreMask<VReg ValClass, int EEW, bits<4> NF> : - Pseudo<(outs), - (ins ValClass:$rd, GPRMemZeroOffset:$rs1, - VMaskOp:$vm, AVL:$vl, sew:$sew), []>, - RISCVVPseudo, + RISCVVPseudo<(outs), + (ins ValClass:$rd, GPRMemZeroOffset:$rs1, + VMaskOp:$vm, AVL:$vl, sew:$sew), []>, RISCVVSSEG<NF, /*Masked*/1, /*Strided*/0, !logtwo(EEW), VLMul> { let mayLoad = 0; let mayStore = 1; @@ -1786,10 +1748,9 @@ class VPseudoUSSegStoreMask<VReg ValClass, class VPseudoSSegStoreNoMask<VReg ValClass, int EEW, bits<4> NF> : - Pseudo<(outs), - (ins ValClass:$rd, GPRMemZeroOffset:$rs1, GPR:$offset, - AVL:$vl, sew:$sew), []>, - RISCVVPseudo, + RISCVVPseudo<(outs), + (ins ValClass:$rd, GPRMemZeroOffset:$rs1, GPR:$offset, + AVL:$vl, sew:$sew), []>, RISCVVSSEG<NF, /*Masked*/0, /*Strided*/1, !logtwo(EEW), VLMul> { let mayLoad = 0; let mayStore = 1; @@ -1801,10 +1762,9 @@ class VPseudoSSegStoreNoMask<VReg ValClass, class VPseudoSSegStoreMask<VReg ValClass, int EEW, bits<4> NF> : - Pseudo<(outs), - (ins ValClass:$rd, GPRMemZeroOffset:$rs1, GPR: $offset, - VMaskOp:$vm, AVL:$vl, sew:$sew), []>, - RISCVVPseudo, + RISCVVPseudo<(outs), + (ins ValClass:$rd, GPRMemZeroOffset:$rs1, GPR: $offset, + VMaskOp:$vm, AVL:$vl, sew:$sew), []>, RISCVVSSEG<NF, /*Masked*/1, /*Strided*/1, !logtwo(EEW), VLMul> { let mayLoad = 0; let mayStore = 1; @@ -1820,10 +1780,9 @@ class VPseudoISegStoreNoMask<VReg ValClass, bits<3> LMUL, bits<4> NF, bit Ordered> : - Pseudo<(outs), - (ins ValClass:$rd, GPRMemZeroOffset:$rs1, IdxClass: $index, - AVL:$vl, sew:$sew), []>, - RISCVVPseudo, + RISCVVPseudo<(outs), + (ins ValClass:$rd, GPRMemZeroOffset:$rs1, IdxClass: $index, + AVL:$vl, sew:$sew), []>, RISCVVSXSEG<NF, /*Masked*/0, Ordered, !logtwo(EEW), VLMul, LMUL> { let 
mayLoad = 0; let mayStore = 1; @@ -1838,10 +1797,9 @@ class VPseudoISegStoreMask<VReg ValClass, bits<3> LMUL, bits<4> NF, bit Ordered> : - Pseudo<(outs), - (ins ValClass:$rd, GPRMemZeroOffset:$rs1, IdxClass: $index, - VMaskOp:$vm, AVL:$vl, sew:$sew), []>, - RISCVVPseudo, + RISCVVPseudo<(outs), + (ins ValClass:$rd, GPRMemZeroOffset:$rs1, IdxClass: $index, + VMaskOp:$vm, AVL:$vl, sew:$sew), []>, RISCVVSXSEG<NF, /*Masked*/1, Ordered, !logtwo(EEW), VLMul, LMUL> { let mayLoad = 0; let mayStore = 1; @@ -6745,16 +6703,14 @@ let Predicates = [HasVInstructions] in { let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { let HasSEWOp = 1, BaseInstr = VMV_X_S in def PseudoVMV_X_S: - Pseudo<(outs GPR:$rd), (ins VR:$rs2, sew:$sew), []>, - Sched<[WriteVMovXS, ReadVMovXS]>, - RISCVVPseudo; + RISCVVPseudo<(outs GPR:$rd), (ins VR:$rs2, sew:$sew), []>, + Sched<[WriteVMovXS, ReadVMovXS]>; let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VMV_S_X, isReMaterializable = 1, Constraints = "$rd = $passthru" in - def PseudoVMV_S_X: Pseudo<(outs VR:$rd), + def PseudoVMV_S_X: RISCVVPseudo<(outs VR:$rd), (ins VR:$passthru, GPR:$rs1, AVL:$vl, sew:$sew), []>, - Sched<[WriteVMovSX, ReadVMovSX_V, ReadVMovSX_X]>, - RISCVVPseudo; + Sched<[WriteVMovSX, ReadVMovSX_V, ReadVMovSX_X]>; } } // Predicates = [HasVInstructions] @@ -6767,18 +6723,16 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { foreach f = FPList in { let HasSEWOp = 1, BaseInstr = VFMV_F_S in def "PseudoVFMV_" # f.FX # "_S" : - Pseudo<(outs f.fprclass:$rd), + RISCVVPseudo<(outs f.fprclass:$rd), (ins VR:$rs2, sew:$sew), []>, - Sched<[WriteVMovFS, ReadVMovFS]>, - RISCVVPseudo; + Sched<[WriteVMovFS, ReadVMovFS]>; let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VFMV_S_F, isReMaterializable = 1, Constraints = "$rd = $passthru" in def "PseudoVFMV_S_" # f.FX : - Pseudo<(outs VR:$rd), + RISCVVPseudo<(outs VR:$rd), (ins VR:$passthru, f.fprclass:$rs1, AVL:$vl, sew:$sew), []>, - Sched<[WriteVMovSF, ReadVMovSF_V, ReadVMovSF_F]>, - RISCVVPseudo; + Sched<[WriteVMovSF, ReadVMovSF_V, ReadVMovSF_F]>; } } } // Predicates = [HasVInstructionsAnyF] diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td index 5220815..1bb67f4 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td @@ -448,11 +448,10 @@ class NDSRVInstVLN<bits<5> funct5, string opcodestr> } class VPseudoVLN8NoMask<VReg RetClass, bit U> : - Pseudo<(outs RetClass:$rd), - (ins RetClass:$dest, - GPRMemZeroOffset:$rs1, - AVL:$vl, sew:$sew, vec_policy:$policy), []>, - RISCVVPseudo, + RISCVVPseudo<(outs RetClass:$rd), + (ins RetClass:$dest, + GPRMemZeroOffset:$rs1, + AVL:$vl, sew:$sew, vec_policy:$policy), []>, RISCVNDSVLN</*Masked*/0, /*Unsigned*/U, !logtwo(8), VLMul> { let mayLoad = 1; let mayStore = 0; @@ -464,11 +463,11 @@ class VPseudoVLN8NoMask<VReg RetClass, bit U> : } class VPseudoVLN8Mask<VReg RetClass, bit U> : - Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd), - (ins GetVRegNoV0<RetClass>.R:$passthru, - GPRMemZeroOffset:$rs1, - VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), []>, - RISCVVPseudo, + RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd), + (ins GetVRegNoV0<RetClass>.R:$passthru, + GPRMemZeroOffset:$rs1, + VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), + []>, RISCVNDSVLN</*Masked*/1, /*Unsigned*/U, !logtwo(8), VLMul> { let mayLoad = 1; let mayStore = 0; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td index 3912eb0..ebcf079 100644 --- 
a/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td @@ -154,18 +154,17 @@ foreach m = MxList in { let VLMul = m.value in { let BaseInstr = RI_VEXTRACT in def PseudoRI_VEXTRACT_ # mx : - Pseudo<(outs GPR:$rd), (ins m.vrclass:$rs2, uimm5:$idx, ixlenimm:$sew), - []>, - RISCVVPseudo; + RISCVVPseudo<(outs GPR:$rd), + (ins m.vrclass:$rs2, uimm5:$idx, ixlenimm:$sew), + []>; let HasVLOp = 1, BaseInstr = RI_VINSERT, HasVecPolicyOp = 1, Constraints = "$rd = $rs1" in def PseudoRI_VINSERT_ # mx : - Pseudo<(outs m.vrclass:$rd), - (ins m.vrclass:$rs1, GPR:$rs2, uimm5:$idx, AVL:$vl, - ixlenimm:$sew, ixlenimm:$policy), - []>, - RISCVVPseudo; + RISCVVPseudo<(outs m.vrclass:$rd), + (ins m.vrclass:$rs1, GPR:$rs2, uimm5:$idx, AVL:$vl, + ixlenimm:$sew, ixlenimm:$policy), + []>; } } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td index 17fb75e..a47dfe3 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td @@ -243,10 +243,9 @@ let Predicates = [HasVendorXSfvfnrclipxfqf], DecoderNamespace = "XSfvector", } class VPseudoVC_X<Operand OpClass, DAGOperand RS1Class> : - Pseudo<(outs), - (ins OpClass:$op1, payload5:$rs2, payload5:$rd, RS1Class:$r1, - AVL:$vl, sew:$sew), []>, - RISCVVPseudo { + RISCVVPseudo<(outs), + (ins OpClass:$op1, payload5:$rs2, payload5:$rd, RS1Class:$r1, + AVL:$vl, sew:$sew), []> { let mayLoad = 0; let mayStore = 0; let HasVLOp = 1; @@ -255,10 +254,9 @@ class VPseudoVC_X<Operand OpClass, DAGOperand RS1Class> : } class VPseudoVC_XV<Operand OpClass, VReg RS2Class, DAGOperand RS1Class> : - Pseudo<(outs), - (ins OpClass:$op1, payload5:$rd, RS2Class:$rs2, RS1Class:$r1, - AVL:$vl, sew:$sew), []>, - RISCVVPseudo { + RISCVVPseudo<(outs), + (ins OpClass:$op1, payload5:$rd, RS2Class:$rs2, RS1Class:$r1, + AVL:$vl, sew:$sew), []> { let mayLoad = 0; let mayStore = 0; let HasVLOp = 1; @@ -268,10 +266,9 @@ class VPseudoVC_XV<Operand OpClass, VReg RS2Class, DAGOperand RS1Class> : class VPseudoVC_XVV<Operand OpClass, VReg RDClass, VReg RS2Class, DAGOperand RS1Class> : - Pseudo<(outs), - (ins OpClass:$op1, RDClass:$rd, RS2Class:$rs2, RS1Class:$r1, - AVL:$vl, sew:$sew), []>, - RISCVVPseudo { + RISCVVPseudo<(outs), + (ins OpClass:$op1, RDClass:$rd, RS2Class:$rs2, RS1Class:$r1, + AVL:$vl, sew:$sew), []> { let mayLoad = 0; let mayStore = 0; let HasVLOp = 1; @@ -280,10 +277,9 @@ class VPseudoVC_XVV<Operand OpClass, VReg RDClass, VReg RS2Class, } class VPseudoVC_V_X<Operand OpClass, VReg RDClass, DAGOperand RS1Class> : - Pseudo<(outs RDClass:$rd), - (ins OpClass:$op1, payload5:$rs2, RS1Class:$r1, - AVL:$vl, sew:$sew), []>, - RISCVVPseudo { + RISCVVPseudo<(outs RDClass:$rd), + (ins OpClass:$op1, payload5:$rs2, RS1Class:$r1, + AVL:$vl, sew:$sew), []> { let mayLoad = 0; let mayStore = 0; let HasVLOp = 1; @@ -293,10 +289,9 @@ class VPseudoVC_V_X<Operand OpClass, VReg RDClass, DAGOperand RS1Class> : class VPseudoVC_V_XV<Operand OpClass, VReg RDClass, VReg RS2Class, DAGOperand RS1Class> : - Pseudo<(outs RDClass:$rd), - (ins OpClass:$op1, RS2Class:$rs2, RS1Class:$r1, - AVL:$vl, sew:$sew), []>, - RISCVVPseudo { + RISCVVPseudo<(outs RDClass:$rd), + (ins OpClass:$op1, RS2Class:$rs2, RS1Class:$r1, + AVL:$vl, sew:$sew), []> { let mayLoad = 0; let mayStore = 0; let HasVLOp = 1; @@ -306,10 +301,9 @@ class VPseudoVC_V_XV<Operand OpClass, VReg RDClass, VReg RS2Class, class VPseudoVC_V_XVV<Operand OpClass, VReg RDClass, VReg RS2Class, DAGOperand RS1Class> : - Pseudo<(outs RDClass:$rd), - 
(ins OpClass:$op1, RDClass:$rs3, RS2Class:$rs2, RS1Class:$r1, - AVL:$vl, sew:$sew), []>, - RISCVVPseudo { + RISCVVPseudo<(outs RDClass:$rd), + (ins OpClass:$op1, RDClass:$rs3, RS2Class:$rs2, RS1Class:$r1, + AVL:$vl, sew:$sew), []> { let mayLoad = 0; let mayStore = 0; let HasVLOp = 1; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td index 4147c97..a250ac8 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td @@ -230,9 +230,8 @@ class ZvkMxSet<string vd_lmul> { } class VPseudoBinaryNoMask_Zvk<DAGOperand RetClass, VReg OpClass> : - Pseudo<(outs RetClass:$rd_wb), - (ins RetClass:$rd, OpClass:$rs2, AVL:$vl, sew:$sew, vec_policy:$policy), []>, - RISCVVPseudo { + RISCVVPseudo<(outs RetClass:$rd_wb), + (ins RetClass:$rd, OpClass:$rs2, AVL:$vl, sew:$sew, vec_policy:$policy), []> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -246,10 +245,9 @@ class VPseudoBinaryNoMask_Zvk<DAGOperand RetClass, VReg OpClass> : class VPseudoTernaryNoMask_Zvk<VReg RetClass, VReg Op1Class, DAGOperand Op2Class> : - Pseudo<(outs RetClass:$rd_wb), + RISCVVPseudo<(outs RetClass:$rd_wb), (ins RetClass:$rd, Op1Class:$rs2, Op2Class:$rs1, - AVL:$vl, sew:$sew, vec_policy:$policy), []>, - RISCVVPseudo { + AVL:$vl, sew:$sew, vec_policy:$policy), []> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp index 6de870c..0565fcd 100644 --- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp +++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp @@ -131,10 +131,14 @@ static bool getMemOperands(unsigned Factor, VectorType *VTy, Type *XLenTy, : Constant::getAllOnesValue(XLenTy); return true; } - if (auto *VPLdSt = dyn_cast<VPIntrinsic>(I)) { - assert((VPLdSt->getIntrinsicID() == Intrinsic::vp_load || - VPLdSt->getIntrinsicID() == Intrinsic::vp_store) && - "Unexpected intrinsic"); + + auto *II = cast<IntrinsicInst>(I); + switch (II->getIntrinsicID()) { + default: + llvm_unreachable("Unsupported intrinsic type"); + case Intrinsic::vp_load: + case Intrinsic::vp_store: { + auto *VPLdSt = cast<VPIntrinsic>(I); Ptr = VPLdSt->getMemoryPointerParam(); Alignment = VPLdSt->getPointerAlignment().value_or( DL.getABITypeAlign(VTy->getElementType())); @@ -151,21 +155,32 @@ static bool getMemOperands(unsigned Factor, VectorType *VTy, Type *XLenTy, VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy); return true; } - auto *II = cast<IntrinsicInst>(I); - assert(II->getIntrinsicID() == Intrinsic::masked_load && - "Unexpected intrinsic"); - Ptr = II->getOperand(0); - Alignment = cast<ConstantInt>(II->getArgOperand(1))->getAlignValue(); + case Intrinsic::masked_load: { + Ptr = II->getOperand(0); + Alignment = cast<ConstantInt>(II->getArgOperand(1))->getAlignValue(); - if (!isa<UndefValue>(II->getOperand(3))) - return false; + if (!isa<UndefValue>(II->getOperand(3))) + return false; - assert(Mask && "masked.load needs a mask!"); + assert(Mask && "masked.load needs a mask!"); - VL = isa<FixedVectorType>(VTy) - ? Builder.CreateElementCount(XLenTy, VTy->getElementCount()) - : Constant::getAllOnesValue(XLenTy); - return true; + VL = isa<FixedVectorType>(VTy) + ? 
Builder.CreateElementCount(XLenTy, VTy->getElementCount()) + : Constant::getAllOnesValue(XLenTy); + return true; + } + case Intrinsic::masked_store: { + Ptr = II->getOperand(1); + Alignment = cast<ConstantInt>(II->getArgOperand(2))->getAlignValue(); + + assert(Mask && "masked.store needs a mask!"); + + VL = isa<FixedVectorType>(VTy) + ? Builder.CreateElementCount(XLenTy, VTy->getElementCount()) + : Constant::getAllOnesValue(XLenTy); + return true; + } + } } /// Lower an interleaved load into a vlsegN intrinsic. @@ -189,7 +204,7 @@ bool RISCVTargetLowering::lowerInterleavedLoad( const DataLayout &DL = Load->getDataLayout(); auto *VTy = cast<FixedVectorType>(Shuffles[0]->getType()); - auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen()); + auto *XLenTy = Builder.getIntNTy(Subtarget.getXLen()); Value *Ptr, *VL; Align Alignment; @@ -217,6 +232,7 @@ bool RISCVTargetLowering::lowerInterleavedLoad( Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load, {VTy, BasePtr->getType(), Stride->getType()}, {BasePtr, Stride, Mask, VL}); + Alignment = commonAlignment(Alignment, Indices[0] * ScalarSizeInBytes); CI->addParamAttr(0, Attribute::getWithAlignment(CI->getContext(), Alignment)); Shuffles[0]->replaceAllUsesWith(CI); @@ -250,22 +266,28 @@ bool RISCVTargetLowering::lowerInterleavedLoad( /// /// Note that the new shufflevectors will be removed and we'll only generate one /// vsseg3 instruction in CodeGen. -bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI, +bool RISCVTargetLowering::lowerInterleavedStore(Instruction *Store, + Value *LaneMask, ShuffleVectorInst *SVI, unsigned Factor) const { - IRBuilder<> Builder(SI); - const DataLayout &DL = SI->getDataLayout(); + IRBuilder<> Builder(Store); + const DataLayout &DL = Store->getDataLayout(); auto Mask = SVI->getShuffleMask(); auto *ShuffleVTy = cast<FixedVectorType>(SVI->getType()); // Given SVI : <n*factor x ty>, then VTy : <n x ty> auto *VTy = FixedVectorType::get(ShuffleVTy->getElementType(), ShuffleVTy->getNumElements() / Factor); - if (!isLegalInterleavedAccessType(VTy, Factor, SI->getAlign(), - SI->getPointerAddressSpace(), DL)) + auto *XLenTy = Builder.getIntNTy(Subtarget.getXLen()); + + Value *Ptr, *VL; + Align Alignment; + if (!getMemOperands(Factor, VTy, XLenTy, Store, Ptr, LaneMask, VL, Alignment)) return false; - auto *PtrTy = SI->getPointerOperandType(); - auto *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen()); + Type *PtrTy = Ptr->getType(); + unsigned AS = PtrTy->getPointerAddressSpace(); + if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL)) + return false; unsigned Index; // If the segment store only has one active lane (i.e. 
the interleave is @@ -276,26 +298,27 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI, unsigned ScalarSizeInBytes = DL.getTypeStoreSize(ShuffleVTy->getElementType()); Value *Data = SVI->getOperand(0); - auto *DataVTy = cast<FixedVectorType>(Data->getType()); + Data = Builder.CreateExtractVector(VTy, Data, uint64_t(0)); Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes); Value *Offset = ConstantInt::get(XLenTy, Index * ScalarSizeInBytes); - Value *BasePtr = Builder.CreatePtrAdd(SI->getPointerOperand(), Offset); - Value *Mask = Builder.getAllOnesMask(DataVTy->getElementCount()); - Value *VL = Builder.CreateElementCount(Builder.getInt32Ty(), - VTy->getElementCount()); - - CallInst *CI = Builder.CreateIntrinsic( - Intrinsic::experimental_vp_strided_store, - {Data->getType(), BasePtr->getType(), Stride->getType()}, - {Data, BasePtr, Stride, Mask, VL}); - CI->addParamAttr( - 1, Attribute::getWithAlignment(CI->getContext(), SI->getAlign())); + Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset); + // Note: Same VL as above, but i32 not xlen due to signature of + // vp.strided.store + VL = Builder.CreateElementCount(Builder.getInt32Ty(), + VTy->getElementCount()); + CallInst *CI = + Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_store, + {VTy, BasePtr->getType(), Stride->getType()}, + {Data, BasePtr, Stride, LaneMask, VL}); + Alignment = commonAlignment(Alignment, Index * ScalarSizeInBytes); + CI->addParamAttr(1, + Attribute::getWithAlignment(CI->getContext(), Alignment)); return true; } Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( - SI->getModule(), FixedVssegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}); + Store->getModule(), FixedVssegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}); SmallVector<Value *, 10> Ops; SmallVector<int, 16> NewShuffleMask; @@ -311,13 +334,7 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI, NewShuffleMask.clear(); } - // This VL should be OK (should be executable in one vsseg instruction, - // potentially under larger LMULs) because we checked that the fixed vector - // type fits in isLegalInterleavedAccessType - Value *VL = Builder.CreateElementCount(XLenTy, VTy->getElementCount()); - Value *StoreMask = Builder.getAllOnesMask(VTy->getElementCount()); - Ops.append({SI->getPointerOperand(), StoreMask, VL}); - + Ops.append({Ptr, LaneMask, VL}); Builder.CreateCall(VssegNFunc, Ops); return true; @@ -334,7 +351,7 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad( VectorType *ResVTy = getDeinterleavedVectorType(DI); const DataLayout &DL = Load->getDataLayout(); - auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen()); + auto *XLenTy = Builder.getIntNTy(Subtarget.getXLen()); Value *Ptr, *VL; Align Alignment; @@ -355,8 +372,7 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad( unsigned NumElts = ResVTy->getElementCount().getKnownMinValue(); Type *VecTupTy = TargetExtType::get( Load->getContext(), "riscv.vector.tuple", - ScalableVectorType::get(Type::getInt8Ty(Load->getContext()), - NumElts * SEW / 8), + ScalableVectorType::get(Builder.getInt8Ty(), NumElts * SEW / 8), Factor); Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration( Load->getModule(), ScalableVlsegIntrIds[Factor - 2], @@ -397,7 +413,7 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore( auto *InVTy = cast<VectorType>(InterleaveValues[0]->getType()); const DataLayout &DL = Store->getDataLayout(); - Type *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen()); + Type *XLenTy = 
Builder.getIntNTy(Subtarget.getXLen()); Value *Ptr, *VL; Align Alignment; @@ -421,9 +437,7 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore( unsigned NumElts = InVTy->getElementCount().getKnownMinValue(); Type *VecTupTy = TargetExtType::get( Store->getContext(), "riscv.vector.tuple", - ScalableVectorType::get(Type::getInt8Ty(Store->getContext()), - NumElts * SEW / 8), - Factor); + ScalableVectorType::get(Builder.getInt8Ty(), NumElts * SEW / 8), Factor); Value *StoredVal = PoisonValue::get(VecTupTy); for (unsigned i = 0; i < Factor; ++i) @@ -440,91 +454,3 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore( Builder.CreateCall(VssegNFunc, Operands); return true; } - -/// Lower an interleaved vp.store into a vssegN intrinsic. -/// -/// E.g. Lower an interleaved vp.store (Factor = 2): -/// -/// %is = tail call <vscale x 64 x i8> -/// @llvm.vector.interleave2.nxv64i8( -/// <vscale x 32 x i8> %load0, -/// <vscale x 32 x i8> %load1 -/// %wide.rvl = shl nuw nsw i32 %rvl, 1 -/// tail call void @llvm.vp.store.nxv64i8.p0( -/// <vscale x 64 x i8> %is, ptr %ptr, -/// %mask, -/// i32 %wide.rvl) -/// -/// Into: -/// call void @llvm.riscv.vsseg2.mask.nxv32i8.i64( -/// <vscale x 32 x i8> %load1, -/// <vscale x 32 x i8> %load2, ptr %ptr, -/// %mask, -/// i64 %rvl) -bool RISCVTargetLowering::lowerInterleavedVPStore( - VPIntrinsic *Store, Value *Mask, - ArrayRef<Value *> InterleaveOperands) const { - assert(Mask && "Expect a valid mask"); - assert(Store->getIntrinsicID() == Intrinsic::vp_store && - "Unexpected intrinsic"); - - const unsigned Factor = InterleaveOperands.size(); - - auto *VTy = dyn_cast<VectorType>(InterleaveOperands[0]->getType()); - if (!VTy) - return false; - - const DataLayout &DL = Store->getDataLayout(); - Align Alignment = Store->getParamAlign(1).value_or( - DL.getABITypeAlign(VTy->getElementType())); - if (!isLegalInterleavedAccessType( - VTy, Factor, Alignment, - Store->getArgOperand(1)->getType()->getPointerAddressSpace(), DL)) - return false; - - IRBuilder<> Builder(Store); - Value *WideEVL = Store->getArgOperand(3); - // Conservatively check if EVL is a multiple of factor, otherwise some - // (trailing) elements might be lost after the transformation. 
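Both the consolidated getMemOperands path and the vp.store path being removed here derive the per-segment VL the same way: divide the wide EVL by the interleave factor and widen the result to XLEN. A minimal sketch of that computation follows; the helper name `getSegmentVL` is illustrative and not an in-tree function.

```cpp
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"

// Derive the VL for one segment of a Factor-way interleaved vp access.
// Only valid when the caller has already proven WideEVL % Factor == 0,
// which is what the isMultipleOfN() guard here is for.
static llvm::Value *getSegmentVL(llvm::IRBuilder<> &Builder,
                                 llvm::Value *WideEVL, unsigned Factor,
                                 llvm::Type *XLenTy) {
  llvm::Value *FactorC = llvm::ConstantInt::get(WideEVL->getType(), Factor);
  // Exact division: no rounding, so no trailing elements can be dropped.
  llvm::Value *EVL = Builder.CreateExactUDiv(WideEVL, FactorC);
  // The RVV segment intrinsics expect an XLEN-wide VL operand.
  return Builder.CreateZExt(EVL, XLenTy);
}
```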
- if (!isMultipleOfN(WideEVL, Store->getDataLayout(), Factor)) - return false; - - auto *PtrTy = Store->getArgOperand(1)->getType(); - auto *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen()); - auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor); - Value *EVL = - Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy); - - if (isa<FixedVectorType>(VTy)) { - SmallVector<Value *, 8> Operands(InterleaveOperands); - Operands.append({Store->getArgOperand(1), Mask, EVL}); - Builder.CreateIntrinsic(FixedVssegIntrIds[Factor - 2], - {VTy, PtrTy, XLenTy}, Operands); - return true; - } - - unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType()); - unsigned NumElts = VTy->getElementCount().getKnownMinValue(); - Type *VecTupTy = TargetExtType::get( - Store->getContext(), "riscv.vector.tuple", - ScalableVectorType::get(Type::getInt8Ty(Store->getContext()), - NumElts * SEW / 8), - Factor); - - Function *VecInsertFunc = Intrinsic::getOrInsertDeclaration( - Store->getModule(), Intrinsic::riscv_tuple_insert, {VecTupTy, VTy}); - Value *StoredVal = PoisonValue::get(VecTupTy); - for (unsigned i = 0; i < Factor; ++i) - StoredVal = Builder.CreateCall( - VecInsertFunc, {StoredVal, InterleaveOperands[i], Builder.getInt32(i)}); - - Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( - Store->getModule(), ScalableVssegIntrIds[Factor - 2], - {VecTupTy, PtrTy, Mask->getType(), EVL->getType()}); - - Value *Operands[] = {StoredVal, Store->getArgOperand(1), Mask, EVL, - ConstantInt::get(XLenTy, Log2_64(SEW))}; - - Builder.CreateCall(VssegNFunc, Operands); - return true; -} diff --git a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td index 3e286a7..bf23812 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td @@ -24,6 +24,67 @@ class SMX60IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit isF = 0 bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW)); } +defvar SMX60VLEN = 256; +defvar SMX60DLEN = !div(SMX60VLEN, 2); + +class Get1248Latency<string mx> { + int c = !cond( + !eq(mx, "M2") : 2, + !eq(mx, "M4") : 4, + !eq(mx, "M8") : 8, + true: 1 + ); +} + +// Used for: logical opsz, shifts, sign ext, merge/move, FP sign/recip/convert, mask ops, slides +class Get4816Latency<string mx> { + int c = !cond( + !eq(mx, "M4") : 8, + !eq(mx, "M8") : 16, + true: 4 + ); +} + +// Used for: arithmetic (add/sub/min/max), saturating/averaging, FP add/sub/min/max +class Get458Latency<string mx> { + int c = !cond( + !eq(mx, "M4") : 5, + !eq(mx, "M8") : 8, + true: 4 + ); +} + +// Widening scaling pattern (4,4,4,4,5,8,8): plateaus at higher LMULs +// Used for: widening operations +class Get4588Latency<string mx> { + int c = !cond( + !eq(mx, "M2") : 5, + !eq(mx, "M4") : 8, + !eq(mx, "M8") : 8, // M8 not supported for most widening, fallback + true: 4 + ); +} + +// Used for: mask-producing comparisons, carry ops with mask, FP comparisons +class Get461018Latency<string mx> { + int c = !cond( + !eq(mx, "M2") : 6, + !eq(mx, "M4") : 10, + !eq(mx, "M8") : 18, + true: 4 + ); +} + +// Used for: e64 multiply pattern, complex ops +class Get781632Latency<string mx> { + int c = !cond( + !eq(mx, "M2") : 8, + !eq(mx, "M4") : 16, + !eq(mx, "M8") : 32, + true: 7 + ); +} + def SpacemitX60Model : SchedMachineModel { let IssueWidth = 2; // dual-issue let MicroOpBufferSize = 0; // in-order @@ -322,58 +383,96 @@ foreach LMul = [1, 2, 4, 8] in { foreach mx = SchedMxList in { defvar IsWorstCase = 
SMX60IsWorstCaseMX<mx, SchedMxList>.c; - defm "" : LMULWriteResMX<"WriteVIALUV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIALUX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIALUI", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVExtV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVICALUV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVICALUX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVICALUI", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVICALUMV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVICALUMX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVICALUMI", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVICmpV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVICmpX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVICmpI", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIMergeV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIMergeX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIMergeI", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIMovV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIMovX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIMovI", [SMX60_VIEU], mx, IsWorstCase>; - - defm "" : LMULWriteResMX<"WriteVShiftV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVShiftX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVShiftI", [SMX60_VIEU], mx, IsWorstCase>; - - defm "" : LMULWriteResMX<"WriteVIMulV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIMulX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIMulAddV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIMulAddX", [SMX60_VIEU], mx, IsWorstCase>; + let Latency = Get458Latency<mx>.c, ReleaseAtCycles = [4] in { + defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SMX60_VIEU], mx, IsWorstCase>; + } + + let Latency = Get4816Latency<mx>.c, ReleaseAtCycles = [4] in { + // Pattern of vadd, vsub, vrsub: 4/4/5/8 + // Pattern of vand, vor, vxor: 4/4/8/16 + // They are grouped together, so we used the worst case 4/4/8/16 + // TODO: use InstRW to override individual instructions' scheduling data + defm "" : LMULWriteResMX<"WriteVIALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIALUX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIALUI", [SMX60_VIEU], mx, IsWorstCase>; + + defm "" : LMULWriteResMX<"WriteVExtV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMergeV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMergeX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMergeI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMovV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMovX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMovI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVShiftV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVShiftX", [SMX60_VIEU], mx, 
IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVShiftI", [SMX60_VIEU], mx, IsWorstCase>; + + defm "" : LMULWriteResMX<"WriteVICALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUI", [SMX60_VIEU], mx, IsWorstCase>; + } + + let Latency = Get461018Latency<mx>.c, ReleaseAtCycles = [4] in { + defm "" : LMULWriteResMX<"WriteVICALUMV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUMX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUMI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICmpV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICmpX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICmpI", [SMX60_VIEU], mx, IsWorstCase>; + } + + // Pattern of vmacc, vmadd, vmul, vmulh, etc.: e8/e16 = 4/4/5/8, e32 = 5,5,5,8, + // e64 = 7,8,16,32. We use the worst-case until we can split the SEW. + // TODO: change WriteVIMulV, etc to be defined with LMULSEWSchedWrites + let Latency = Get781632Latency<mx>.c, ReleaseAtCycles = [7] in { + defm "" : LMULWriteResMX<"WriteVIMulV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMulX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMulAddV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMulAddX", [SMX60_VIEU], mx, IsWorstCase>; + } } // Widening +// Pattern of vwmul, vwmacc, etc: e8/e16 = 4/4/5/8, e32 = 5,5,5,8 +// We use the worst-case for all. foreach mx = SchedMxListW in { defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c; - defm "" : LMULWriteResMX<"WriteVIWALUV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIWALUX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIWALUI", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIWMulV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIWMulX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIWMulAddV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIWMulAddX", [SMX60_VIEU], mx, IsWorstCase>; + let Latency = Get4588Latency<mx>.c, ReleaseAtCycles = [4] in { + defm "" : LMULWriteResMX<"WriteVIWALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWALUX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWALUI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWMulV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWMulX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWMulAddV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWMulAddX", [SMX60_VIEU], mx, IsWorstCase>; + } } -// Vector Integer Division and Remainder +// Division and remainder operations +// Pattern of vdivu: 11/11/11/20/40/80/160 +// Pattern of vdiv: 12/12/12/22/44/88/176 +// Pattern of vremu: 12/12/12/22/44/88/176 +// Pattern of vrem: 13/13/13/24/48/96/192 +// We use for all: 12/12/12/24/48/96/192 +// TODO: Create separate WriteVIRem to more closely match the latencies foreach mx = SchedMxList in { foreach sew = SchedSEWSet<mx>.val in { defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c; - defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SMX60_VIEU], mx, sew, IsWorstCase>; - defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SMX60_VIEU], mx, sew, IsWorstCase>; + // Slightly reduced for fractional LMULs + defvar Multiplier = !cond( + !eq(mx, "MF8") : 
12, + !eq(mx, "MF4") : 12, + !eq(mx, "MF2") : 12, + true: 24 + ); + + let Latency = !mul(Get1248Latency<mx>.c, Multiplier), ReleaseAtCycles = [12] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SMX60_VIEU], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SMX60_VIEU], mx, sew, IsWorstCase>; + } } } @@ -381,12 +480,21 @@ foreach mx = SchedMxList in { foreach mx = SchedMxListW in { defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c; - defm "" : LMULWriteResMX<"WriteVNShiftV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVNShiftX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVNShiftI", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVNClipV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVNClipX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVNClipI", [SMX60_VIEU], mx, IsWorstCase>; + // Slightly increased for integer LMULs + defvar Multiplier = !cond( + !eq(mx, "M2") : 2, + !eq(mx, "M4") : 2, + true: 1 + ); + + let Latency = !mul(Get4816Latency<mx>.c, Multiplier), ReleaseAtCycles = [4] in { + defm "" : LMULWriteResMX<"WriteVNShiftV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVNShiftX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVNShiftI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVNClipV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVNClipX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVNClipI", [SMX60_VIEU], mx, IsWorstCase>; + } } // 12. Vector Fixed-Point Arithmetic Instructions diff --git a/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.cpp b/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.cpp index 668e596..6ecddad 100644 --- a/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.cpp @@ -24,6 +24,18 @@ void RISCVSelectionDAGInfo::verifyTargetNode(const SelectionDAG &DAG, switch (N->getOpcode()) { default: return SelectionDAGGenTargetInfo::verifyTargetNode(DAG, N); + case RISCVISD::TUPLE_EXTRACT: + assert(N->getNumOperands() == 2 && "Expected three operands!"); + assert(N->getOperand(1).getOpcode() == ISD::TargetConstant && + N->getOperand(1).getValueType() == MVT::i32 && + "Expected index to be an i32 target constant!"); + break; + case RISCVISD::TUPLE_INSERT: + assert(N->getNumOperands() == 3 && "Expected three operands!"); + assert(N->getOperand(2).getOpcode() == ISD::TargetConstant && + N->getOperand(2).getValueType() == MVT::i32 && + "Expected index to be an i32 target constant!"); + break; case RISCVISD::VQDOT_VL: case RISCVISD::VQDOTU_VL: case RISCVISD::VQDOTSU_VL: { diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp index c754de4..e35ffaf 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp @@ -216,7 +216,7 @@ unsigned RISCVSubtarget::getMinimumJumpTableEntries() const { } void RISCVSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, - unsigned NumRegionInstrs) const { + const SchedRegion &Region) const { // Do bidirectional scheduling since it provides a more balanced scheduling // leading to better performance. This will increase compile time. 
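The SpacemitX60 division/remainder entries above follow a simple closed form: the per-LMUL base from Get1248Latency scaled by 12 for fractional LMULs and 24 otherwise. A small standalone check of that arithmetic, illustrative only and not part of the patch:

```cpp
// Latency(mx) = Get1248Latency(mx) * Multiplier, with Multiplier = 12 for
// MF8/MF4/MF2 and 24 for M1..M8; Get1248Latency is 1 for fractional LMULs.
constexpr unsigned get1248(unsigned IntegerLMul) { // 0 encodes "fractional"
  return IntegerLMul == 0 ? 1 : IntegerLMul;
}
constexpr unsigned divLatency(unsigned IntegerLMul) {
  return get1248(IntegerLMul) * (IntegerLMul == 0 ? 12 : 24);
}
static_assert(divLatency(0) == 12, "MF8/MF4/MF2 use the reduced latency");
static_assert(divLatency(1) == 24 && divLatency(2) == 48, "M1/M2");
static_assert(divLatency(4) == 96 && divLatency(8) == 192,
              "M4/M8 match the documented 12/12/12/24/48/96/192 pattern");
```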
Policy.OnlyTopDown = false; @@ -231,8 +231,8 @@ void RISCVSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, Policy.ShouldTrackPressure = true; } -void RISCVSubtarget::overridePostRASchedPolicy(MachineSchedPolicy &Policy, - unsigned NumRegionInstrs) const { +void RISCVSubtarget::overridePostRASchedPolicy( + MachineSchedPolicy &Policy, const SchedRegion &Region) const { MISched::Direction PostRASchedDirection = getPostRASchedDirection(); if (PostRASchedDirection == MISched::TopDown) { Policy.OnlyTopDown = true; diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h index 4f560cc..fd57e02 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h @@ -395,11 +395,11 @@ public: } void overrideSchedPolicy(MachineSchedPolicy &Policy, - unsigned NumRegionInstrs) const override; + const SchedRegion &Region) const override; void overridePostRASchedPolicy(MachineSchedPolicy &Policy, - unsigned NumRegionInstrs) const override; + const SchedRegion &Region) const override; }; -} // End llvm namespace +} // namespace llvm #endif diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 56ead92..fd634b5 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -1489,6 +1489,34 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, cast<VectorType>(ICA.getArgTypes()[0]), {}, CostKind, 0, cast<VectorType>(ICA.getReturnType())); } + case Intrinsic::fptoui_sat: + case Intrinsic::fptosi_sat: { + InstructionCost Cost = 0; + bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat; + Type *SrcTy = ICA.getArgTypes()[0]; + + auto SrcLT = getTypeLegalizationCost(SrcTy); + auto DstLT = getTypeLegalizationCost(RetTy); + if (!SrcTy->isVectorTy()) + break; + + if (!SrcLT.first.isValid() || !DstLT.first.isValid()) + return InstructionCost::getInvalid(); + + Cost += + getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI, + RetTy, SrcTy, TTI::CastContextHint::None, CostKind); + + // Handle NaN. + // vmfne v0, v8, v8 # If v8[i] is NaN set v0[i] to 1. + // vmerge.vim v8, v8, 0, v0 # Convert NaN to 0. + Type *CondTy = RetTy->getWithNewBitWidth(1); + Cost += getCmpSelInstrCost(BinaryOperator::FCmp, SrcTy, CondTy, + CmpInst::FCMP_UNO, CostKind); + Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, + CmpInst::FCMP_UNO, CostKind); + return Cost; + } } if (ST->hasVInstructions() && RetTy->isVectorTy()) { diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 12bf8c1..d62d99c 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -116,8 +116,8 @@ public: } TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow) const override { - return ST->hasVInstructions() ? TailFoldingStyle::Data - : TailFoldingStyle::DataWithoutLaneMask; + return ST->hasVInstructions() ? 
TailFoldingStyle::DataWithEVL + : TailFoldingStyle::None; } std::optional<unsigned> getMaxVScale() const override; std::optional<unsigned> getVScaleForTuning() const override; diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp index 15bd346..c946451 100644 --- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp @@ -114,14 +114,6 @@ FunctionPass *llvm::createRISCVVLOptimizerPass() { return new RISCVVLOptimizer(); } -/// Return true if R is a physical or virtual vector register, false otherwise. -static bool isVectorRegClass(Register R, const MachineRegisterInfo *MRI) { - if (R.isPhysical()) - return RISCV::VRRegClass.contains(R); - const TargetRegisterClass *RC = MRI->getRegClass(R); - return RISCVRI::isVRegClass(RC->TSFlags); -} - LLVM_ATTRIBUTE_UNUSED static raw_ostream &operator<<(raw_ostream &OS, const OperandInfo &OI) { OI.print(OS); @@ -183,37 +175,28 @@ static unsigned getIntegerExtensionOperandEEW(unsigned Factor, return Log2EEW; } -/// Check whether MO is a mask operand of MI. -static bool isMaskOperand(const MachineInstr &MI, const MachineOperand &MO, - const MachineRegisterInfo *MRI) { - - if (!MO.isReg() || !isVectorRegClass(MO.getReg(), MRI)) - return false; - - const MCInstrDesc &Desc = MI.getDesc(); - return Desc.operands()[MO.getOperandNo()].RegClass == RISCV::VMV0RegClassID; -} - static std::optional<unsigned> getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) { const MachineInstr &MI = *MO.getParent(); + const MCInstrDesc &Desc = MI.getDesc(); const RISCVVPseudosTable::PseudoInfo *RVV = RISCVVPseudosTable::getPseudoInfo(MI.getOpcode()); assert(RVV && "Could not find MI in PseudoTable"); // MI has a SEW associated with it. The RVV specification defines // the EEW of each operand and definition in relation to MI.SEW. 
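The EEW bookkeeping in this pass relies on a few log2 relations: SEW is carried as a log2 value on the pseudo, mask operands always have EEW = 1, and widening results use EEW = 2 * SEW. A tiny illustrative helper, not taken from this patch:

```cpp
// Log2 relations used when mapping operands to EEWs. For example, a widening
// op on SEW=32 elements (Log2SEW = 5) produces EEW=64 results (Log2EEW = 6),
// while a mask operand always has EEW=1 (Log2EEW = 0).
constexpr unsigned maskOperandLog2EEW() { return 0; }
constexpr unsigned wideningDefLog2EEW(unsigned MILog2SEW) {
  return MILog2SEW + 1; // EEW = 2 * SEW
}
static_assert(wideningDefLog2EEW(5) == 6, "SEW=32 widens to EEW=64");
static_assert(maskOperandLog2EEW() == 0, "mask EEW is always 1");
```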
- unsigned MILog2SEW = - MI.getOperand(RISCVII::getSEWOpNum(MI.getDesc())).getImm(); + unsigned MILog2SEW = MI.getOperand(RISCVII::getSEWOpNum(Desc)).getImm(); - const bool HasPassthru = RISCVII::isFirstDefTiedToFirstUse(MI.getDesc()); - const bool IsTied = RISCVII::isTiedPseudo(MI.getDesc().TSFlags); + const bool HasPassthru = RISCVII::isFirstDefTiedToFirstUse(Desc); + const bool IsTied = RISCVII::isTiedPseudo(Desc.TSFlags); bool IsMODef = MO.getOperandNo() == 0 || (HasPassthru && MO.getOperandNo() == MI.getNumExplicitDefs()); // All mask operands have EEW=1 - if (isMaskOperand(MI, MO, MRI)) + const MCOperandInfo &Info = Desc.operands()[MO.getOperandNo()]; + if (Info.OperandType == MCOI::OPERAND_REGISTER && + Info.RegClass == RISCV::VMV0RegClassID) return 0; // switch against BaseInstr to reduce number of cases that need to be @@ -1296,8 +1279,8 @@ bool RISCVVLOptimizer::isCandidate(const MachineInstr &MI) const { TII->get(RISCV::getRVVMCOpcode(MI.getOpcode())).TSFlags) && "Instruction shouldn't be supported if elements depend on VL"); - assert(MI.getOperand(0).isReg() && - isVectorRegClass(MI.getOperand(0).getReg(), MRI) && + assert(RISCVRI::isVRegClass( + MRI->getRegClass(MI.getOperand(0).getReg())->TSFlags) && "All supported instructions produce a vector register result"); LLVM_DEBUG(dbgs() << "Found a candidate for VL reduction: " << MI << "\n"); @@ -1486,7 +1469,6 @@ bool RISCVVLOptimizer::tryReduceVL(MachineInstr &MI) const { } bool RISCVVLOptimizer::runOnMachineFunction(MachineFunction &MF) { - assert(DemandedVLs.size() == 0); if (skipFunction(MF.getFunction())) return false; @@ -1499,6 +1481,8 @@ bool RISCVVLOptimizer::runOnMachineFunction(MachineFunction &MF) { TII = ST.getInstrInfo(); + assert(DemandedVLs.empty()); + // For each instruction that defines a vector, compute what VL its // downstream users demand. for (MachineBasicBlock *MBB : post_order(&MF)) { diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp index 84ef539..c1cc19b 100644 --- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp +++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp @@ -434,6 +434,15 @@ bool RISCVVectorPeephole::convertSameMaskVMergeToVMv(MachineInstr &MI) { if (!isKnownSameDefs(TrueMask.getReg(), MIMask.getReg())) return false; + // Masked off lanes past TrueVL will come from False, and converting to vmv + // will lose these lanes unless MIVL <= TrueVL. 
+ // TODO: We could relax this for False == Passthru and True policy == TU + const MachineOperand &MIVL = MI.getOperand(RISCVII::getVLOpNum(MI.getDesc())); + const MachineOperand &TrueVL = + True->getOperand(RISCVII::getVLOpNum(True->getDesc())); + if (!RISCV::isVLKnownLE(MIVL, TrueVL)) + return false; + // True's passthru needs to be equivalent to False Register TruePassthruReg = True->getOperand(1).getReg(); Register FalseReg = MI.getOperand(2).getReg(); diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index b90e1aa..3c631ce 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -665,10 +665,10 @@ Type *SPIRVEmitIntrinsics::deduceElementTypeHelper( auto *HandleType = cast<TargetExtType>(II->getOperand(0)->getType()); if (HandleType->getTargetExtName() == "spirv.Image" || HandleType->getTargetExtName() == "spirv.SignedImage") { - if (II->hasOneUse()) { - auto *U = *II->users().begin(); + for (User *U : II->users()) { Ty = cast<Instruction>(U)->getAccessType(); - assert(Ty && "Unable to get type for resource pointer."); + if (Ty) + break; } } else if (HandleType->getTargetExtName() == "spirv.VulkanBuffer") { // This call is supposed to index into an array diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp index 6766bd8..595424b 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp @@ -410,6 +410,7 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) { II, Intrinsic::SPVIntrinsics::spv_lifetime_start, {1}); } else { II->eraseFromParent(); + Changed = true; } break; case Intrinsic::lifetime_end: @@ -418,6 +419,7 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) { II, Intrinsic::SPVIntrinsics::spv_lifetime_end, {1}); } else { II->eraseFromParent(); + Changed = true; } break; case Intrinsic::ptr_annotation: diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp index 4a9c88b..a95c4ff 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/SparcFixupKinds.h" -#include "MCTargetDesc/SparcMCAsmInfo.h" #include "MCTargetDesc/SparcMCTargetDesc.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCELFObjectWriter.h" diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp index 1ee6e80..79da53e 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp @@ -13,10 +13,7 @@ #include "MCTargetDesc/SparcMCAsmInfo.h" #include "llvm/BinaryFormat/ELF.h" -#include "llvm/MC/MCAssembler.h" -#include "llvm/MC/MCContext.h" #include "llvm/MC/MCObjectStreamer.h" -#include "llvm/MC/MCValue.h" using namespace llvm; diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp index 9b434d8..1aa8efe 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp +++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp @@ -2201,7 +2201,7 @@ SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op, SDValue Chain = DAG.getEntryNode(); SDValue InGlue; - Chain = DAG.getCALLSEQ_START(Chain, 1, 0, DL); + Chain = 
DAG.getCALLSEQ_START(Chain, 0, 0, DL); Chain = DAG.getCopyToReg(Chain, DL, SP::O0, Argument, InGlue); InGlue = Chain.getValue(1); SDValue Callee = DAG.getTargetExternalSymbol("__tls_get_addr", PtrVT); @@ -2219,7 +2219,7 @@ SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op, InGlue}; Chain = DAG.getNode(SPISD::TLS_CALL, DL, NodeTys, Ops); InGlue = Chain.getValue(1); - Chain = DAG.getCALLSEQ_END(Chain, 1, 0, InGlue, DL); + Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL); InGlue = Chain.getValue(1); SDValue Ret = DAG.getCopyFromReg(Chain, DL, SP::O0, PtrVT, InGlue); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp index 2662241e..e6486e2 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp @@ -256,9 +256,17 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) { // Precompute the set of registers that are unused, so that we can insert // drops to their defs. + // And unstackify any stackified registers that don't have any uses, so that + // they can be dropped later. This can happen when transformations after + // RegStackify remove instructions using stackified registers. BitVector UseEmpty(MRI.getNumVirtRegs()); - for (unsigned I = 0, E = MRI.getNumVirtRegs(); I < E; ++I) - UseEmpty[I] = MRI.use_empty(Register::index2VirtReg(I)); + for (unsigned I = 0, E = MRI.getNumVirtRegs(); I < E; ++I) { + Register Reg = Register::index2VirtReg(I); + if (MRI.use_empty(Reg)) { + UseEmpty[I] = true; + MFI.unstackifyVReg(Reg); + } + } // Visit each instruction in the function. for (MachineBasicBlock &MBB : MF) { diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp index ac819cf..b03b350 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp @@ -15,12 +15,14 @@ #include "WebAssembly.h" #include "WebAssemblyISelLowering.h" #include "WebAssemblyTargetMachine.h" +#include "WebAssemblyUtilities.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/CodeGen/WasmEHFuncInfo.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" // To access function attributes. #include "llvm/IR/IntrinsicsWebAssembly.h" +#include "llvm/MC/MCSymbolWasm.h" #include "llvm/Support/Debug.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/raw_ostream.h" @@ -118,6 +120,51 @@ static SDValue getTagSymNode(int Tag, SelectionDAG *DAG) { return DAG->getTargetExternalSymbol(SymName, PtrVT); } +static APInt encodeFunctionSignature(SelectionDAG *DAG, SDLoc &DL, + SmallVector<MVT, 4> &Returns, + SmallVector<MVT, 4> &Params) { + auto toWasmValType = [](MVT VT) { + if (VT == MVT::i32) { + return wasm::ValType::I32; + } + if (VT == MVT::i64) { + return wasm::ValType::I64; + } + if (VT == MVT::f32) { + return wasm::ValType::F32; + } + if (VT == MVT::f64) { + return wasm::ValType::F64; + } + LLVM_DEBUG(errs() << "Unhandled type for llvm.wasm.ref.test.func: " << VT + << "\n"); + llvm_unreachable("Unhandled type for llvm.wasm.ref.test.func"); + }; + auto NParams = Params.size(); + auto NReturns = Returns.size(); + auto BitWidth = (NParams + NReturns + 2) * 64; + auto Sig = APInt(BitWidth, 0); + + // Annoying special case: if getSignificantBits() <= 64 then InstrEmitter will + // emit an Imm instead of a CImm. 
It simplifies WebAssemblyMCInstLower if we + // always emit a CImm. So xor NParams with 0x7ffffff to ensure + // getSignificantBits() > 64 + Sig |= NReturns ^ 0x7ffffff; + for (auto &Return : Returns) { + auto V = toWasmValType(Return); + Sig <<= 64; + Sig |= (int64_t)V; + } + Sig <<= 64; + Sig |= NParams; + for (auto &Param : Params) { + auto V = toWasmValType(Param); + Sig <<= 64; + Sig |= (int64_t)V; + } + return Sig; +} + void WebAssemblyDAGToDAGISel::Select(SDNode *Node) { // If we have a custom node, we already have selected! if (Node->isMachineOpcode()) { @@ -189,6 +236,58 @@ void WebAssemblyDAGToDAGISel::Select(SDNode *Node) { ReplaceNode(Node, TLSAlign); return; } + case Intrinsic::wasm_ref_test_func: { + // First emit the TABLE_GET instruction to convert function pointer ==> + // funcref + MachineFunction &MF = CurDAG->getMachineFunction(); + auto PtrVT = MVT::getIntegerVT(MF.getDataLayout().getPointerSizeInBits()); + MCSymbol *Table = WebAssembly::getOrCreateFunctionTableSymbol( + MF.getContext(), Subtarget); + SDValue TableSym = CurDAG->getMCSymbol(Table, PtrVT); + SDValue FuncPtr = Node->getOperand(1); + if (Subtarget->hasAddr64() && FuncPtr.getValueType() == MVT::i64) { + // table.get expects an i32 but on 64 bit platforms the function pointer + // is an i64. In that case, i32.wrap_i64 to convert. + FuncPtr = SDValue(CurDAG->getMachineNode(WebAssembly::I32_WRAP_I64, DL, + MVT::i32, FuncPtr), + 0); + } + SDValue FuncRef = + SDValue(CurDAG->getMachineNode(WebAssembly::TABLE_GET_FUNCREF, DL, + MVT::funcref, TableSym, FuncPtr), + 0); + + // Encode the signature information into the type index placeholder. + // This gets decoded and converted into the actual type signature in + // WebAssemblyMCInstLower.cpp. + SmallVector<MVT, 4> Params; + SmallVector<MVT, 4> Returns; + + bool IsParam = false; + // Operand 0 is the return register, Operand 1 is the function pointer. + // The remaining operands encode the type of the function we are testing + // for. + for (unsigned I = 2, E = Node->getNumOperands(); I < E; ++I) { + MVT VT = Node->getOperand(I).getValueType().getSimpleVT(); + if (VT == MVT::Untyped) { + IsParam = true; + continue; + } + if (IsParam) { + Params.push_back(VT); + } else { + Returns.push_back(VT); + } + } + auto Sig = encodeFunctionSignature(CurDAG, DL, Returns, Params); + + auto SigOp = CurDAG->getTargetConstant( + Sig, DL, EVT::getIntegerVT(*CurDAG->getContext(), Sig.getBitWidth())); + MachineSDNode *RefTestNode = CurDAG->getMachineNode( + WebAssembly::REF_TEST_FUNCREF, DL, MVT::i32, {SigOp, FuncRef}); + ReplaceNode(Node, RefTestNode); + return; + } } break; } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 09b8864..11936a3 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -798,6 +798,7 @@ LowerCallResults(MachineInstr &CallResults, DebugLoc DL, MachineBasicBlock *BB, if (IsIndirect) { // Placeholder for the type index. + // This gets replaced with the correct value in WebAssemblyMCInstLower.cpp MIB.addImm(0); // The table into which this call_indirect indexes. 
MCSymbolWasm *Table = IsFuncrefCall diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp index cc36244..4613fcb 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp @@ -15,13 +15,18 @@ #include "WebAssemblyMCInstLower.h" #include "MCTargetDesc/WebAssemblyMCAsmInfo.h" #include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "MCTargetDesc/WebAssemblyMCTypeUtilities.h" #include "TargetInfo/WebAssemblyTargetInfo.h" #include "Utils/WebAssemblyTypeUtilities.h" #include "WebAssemblyAsmPrinter.h" #include "WebAssemblyMachineFunctionInfo.h" #include "WebAssemblyUtilities.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/BinaryFormat/Wasm.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/IR/Constants.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" @@ -152,6 +157,34 @@ MCOperand WebAssemblyMCInstLower::lowerTypeIndexOperand( return MCOperand::createExpr(Expr); } +MCOperand +WebAssemblyMCInstLower::lowerEncodedFunctionSignature(const APInt &Sig) const { + // For APInt a word is 64 bits on all architectures, see definition in APInt.h + auto NumWords = Sig.getNumWords(); + SmallVector<wasm::ValType, 4> Params; + SmallVector<wasm::ValType, 2> Returns; + + int Idx = NumWords; + auto GetWord = [&Idx, &Sig]() { + Idx--; + return Sig.extractBitsAsZExtValue(64, 64 * Idx); + }; + // Annoying special case: if getSignificantBits() <= 64 then InstrEmitter will + // emit an Imm instead of a CImm. It simplifies WebAssemblyMCInstLower if we + // always emit a CImm. So xor NParams with 0x7ffffff to ensure + // getSignificantBits() > 64 + // See encodeFunctionSignature in WebAssemblyISelDAGtoDAG.cpp + int NReturns = GetWord() ^ 0x7ffffff; + for (int I = 0; I < NReturns; I++) { + Returns.push_back(static_cast<wasm::ValType>(GetWord())); + } + int NParams = GetWord(); + for (int I = 0; I < NParams; I++) { + Params.push_back(static_cast<wasm::ValType>(GetWord())); + } + return lowerTypeIndexOperand(std::move(Returns), std::move(Params)); +} + static void getFunctionReturns(const MachineInstr *MI, SmallVectorImpl<wasm::ValType> &Returns) { const Function &F = MI->getMF()->getFunction(); @@ -196,11 +229,30 @@ void WebAssemblyMCInstLower::lower(const MachineInstr *MI, MCOp = MCOperand::createReg(WAReg); break; } + case llvm::MachineOperand::MO_CImmediate: { + // Lower type index placeholder for ref.test + // Currently this is the only way that CImmediates show up so panic if we + // get confused. + unsigned DescIndex = I - NumVariadicDefs; + assert(DescIndex < Desc.NumOperands && "unexpected CImmediate operand"); + auto Operands = Desc.operands(); + const MCOperandInfo &Info = Operands[DescIndex]; + assert(Info.OperandType == WebAssembly::OPERAND_TYPEINDEX && + "unexpected CImmediate operand"); + (void)Info; + MCOp = lowerEncodedFunctionSignature(MO.getCImm()->getValue()); + break; + } case MachineOperand::MO_Immediate: { unsigned DescIndex = I - NumVariadicDefs; if (DescIndex < Desc.NumOperands) { - const MCOperandInfo &Info = Desc.operands()[DescIndex]; + auto Operands = Desc.operands(); + const MCOperandInfo &Info = Operands[DescIndex]; + // Replace type index placeholder with actual type index. The type index + // placeholders are Immediates and have an operand type of + // OPERAND_TYPEINDEX or OPERAND_SIGNATURE. 
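lowerEncodedFunctionSignature above reverses encodeFunctionSignature word by word. A self-contained sketch of that decode, assuming the layout produced by the ISel-side encoder ([NReturns ^ 0x7ffffff][return types...][NParams][param types...], one 64-bit word each, most significant first); the function name is illustrative and the value types are left as raw integers to keep the sketch minimal:

```cpp
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include <cstdint>

// Walk the APInt from its most significant 64-bit word downwards and recover
// the counts and value types packed by the encoder.
static void decodeEncodedSignature(const llvm::APInt &Sig,
                                   llvm::SmallVectorImpl<uint64_t> &Returns,
                                   llvm::SmallVectorImpl<uint64_t> &Params) {
  unsigned Idx = Sig.getNumWords();
  auto GetWord = [&]() { return Sig.extractBitsAsZExtValue(64, 64 * --Idx); };
  uint64_t NReturns = GetWord() ^ 0x7ffffff; // undo the CImm-forcing xor
  for (uint64_t I = 0; I != NReturns; ++I)
    Returns.push_back(GetWord()); // each word holds one wasm::ValType value
  uint64_t NParams = GetWord();
  for (uint64_t I = 0; I != NParams; ++I)
    Params.push_back(GetWord());
}
```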
if (Info.OperandType == WebAssembly::OPERAND_TYPEINDEX) { + // Lower type index placeholder for a CALL_INDIRECT instruction SmallVector<wasm::ValType, 4> Returns; SmallVector<wasm::ValType, 4> Params; @@ -228,6 +280,7 @@ void WebAssemblyMCInstLower::lower(const MachineInstr *MI, break; } if (Info.OperandType == WebAssembly::OPERAND_SIGNATURE) { + // Lower type index placeholder for blocks auto BT = static_cast<WebAssembly::BlockType>(MO.getImm()); assert(BT != WebAssembly::BlockType::Invalid); if (BT == WebAssembly::BlockType::Multivalue) { diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h index 9f08499..34404d9 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h @@ -36,6 +36,7 @@ class LLVM_LIBRARY_VISIBILITY WebAssemblyMCInstLower { MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const; MCOperand lowerTypeIndexOperand(SmallVectorImpl<wasm::ValType> &&, SmallVectorImpl<wasm::ValType> &&) const; + MCOperand lowerEncodedFunctionSignature(const APInt &Sig) const; public: WebAssemblyMCInstLower(MCContext &ctx, WebAssemblyAsmPrinter &printer) diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index b642c1c..8213e51 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -1042,8 +1042,8 @@ private: } PrevState = CurrState; } - void onRParen() { - PrevState = State; + bool onRParen(StringRef &ErrMsg) { + IntelExprState CurrState = State; switch (State) { default: State = IES_ERROR; @@ -1054,9 +1054,27 @@ private: case IES_RBRAC: case IES_RPAREN: State = IES_RPAREN; + // In the case of a multiply, onRegister has already set IndexReg + // directly, with appropriate scale. + // Otherwise if we just saw a register it has only been stored in + // TmpReg, so we need to store it into the state machine. + if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) { + // If we already have a BaseReg, then assume this is the IndexReg with + // no explicit scale. 
+ if (!BaseReg) { + BaseReg = TmpReg; + } else { + if (IndexReg) + return regsUseUpError(ErrMsg); + IndexReg = TmpReg; + Scale = 0; + } + } IC.pushOperator(IC_RPAREN); break; } + PrevState = CurrState; + return false; } bool onOffset(const MCExpr *Val, SMLoc OffsetLoc, StringRef ID, const InlineAsmIdentifierInfo &IDInfo, @@ -2172,7 +2190,11 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { } break; case AsmToken::LParen: SM.onLParen(); break; - case AsmToken::RParen: SM.onRParen(); break; + case AsmToken::RParen: + if (SM.onRParen(ErrMsg)) { + return Error(Tok.getLoc(), ErrMsg); + } + break; } if (SM.hadError()) return Error(Tok.getLoc(), "unknown token in expression"); diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp index f5eeb3b..d691538 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp @@ -11,7 +11,6 @@ //===----------------------------------------------------------------------===// #include "X86MCAsmInfo.h" -#include "MCTargetDesc/X86MCExpr.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCStreamer.h" #include "llvm/Support/CommandLine.h" diff --git a/llvm/lib/Target/X86/X86AsmPrinter.h b/llvm/lib/Target/X86/X86AsmPrinter.h index efb951b..e02b556 100644 --- a/llvm/lib/Target/X86/X86AsmPrinter.h +++ b/llvm/lib/Target/X86/X86AsmPrinter.h @@ -151,6 +151,7 @@ private: MCSymbol *LazyPointer) override; void emitCallInstruction(const llvm::MCInst &MCI); + void maybeEmitNopAfterCallForWindowsEH(const MachineInstr *MI); // Emits a label to mark the next instruction as being relevant to Import Call // Optimization. diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 6281124..11ab8dc 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -5001,9 +5001,12 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, EVT VT = Op.getValueType(); unsigned SizeInBits = VT.getSizeInBits(); - assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!"); unsigned NumElts = SizeInBits / EltSizeInBits; + // Can't split constant. + if ((SizeInBits % EltSizeInBits) != 0) + return false; + // Bitcast a source array of element bits to the target size. auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) { unsigned NumSrcElts = UndefSrcElts.getBitWidth(); @@ -45059,6 +45062,10 @@ bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode( unsigned NumElts = DemandedElts.getBitWidth(); switch (Op.getOpcode()) { + case X86ISD::GlobalBaseReg: + case X86ISD::Wrapper: + case X86ISD::WrapperRIP: + return true; case X86ISD::BLENDI: case X86ISD::PSHUFD: case X86ISD::UNPCKL: @@ -45098,27 +45105,34 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode( bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const { switch (Op.getOpcode()) { + // SSE vector insert/extracts use modulo indices. + case X86ISD::PINSRB: + case X86ISD::PINSRW: + case X86ISD::PEXTRB: + case X86ISD::PEXTRW: + return false; // SSE vector multiplies are either inbounds or saturate. case X86ISD::VPMADDUBSW: case X86ISD::VPMADDWD: + return false; // SSE vector shifts handle out of bounds shift amounts. case X86ISD::VSHLI: case X86ISD::VSRLI: case X86ISD::VSRAI: return false; - // SSE blends. + // SSE blends. case X86ISD::BLENDI: case X86ISD::BLENDV: return false; - // SSE target shuffles. + // SSE target shuffles. 
case X86ISD::PSHUFD: case X86ISD::UNPCKL: case X86ISD::UNPCKH: case X86ISD::VPERMILPI: case X86ISD::VPERMV3: return false; - // SSE comparisons handle all icmp/fcmp cases. - // TODO: Add CMPM/MM with test coverage. + // SSE comparisons handle all icmp/fcmp cases. + // TODO: Add CMPM/MM with test coverage. case X86ISD::CMPP: case X86ISD::PCMPEQ: case X86ISD::PCMPGT: diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 2636979..547b221 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1668,7 +1668,8 @@ namespace llvm { /// Lower interleaved store(s) into target specific /// instructions/intrinsics. - bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, + bool lowerInterleavedStore(Instruction *Store, Value *Mask, + ShuffleVectorInst *SVI, unsigned Factor) const override; SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp index 360293bc..636b072 100644 --- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp +++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp @@ -822,7 +822,8 @@ bool X86TargetLowering::lowerInterleavedLoad( return Grp.isSupported() && Grp.lowerIntoOptimizedSequence(); } -bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI, +bool X86TargetLowering::lowerInterleavedStore(Instruction *Store, + Value *LaneMask, ShuffleVectorInst *SVI, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && @@ -832,6 +833,11 @@ bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI, 0 && "Invalid interleaved store"); + auto *SI = dyn_cast<StoreInst>(Store); + if (!SI) + return false; + assert(!LaneMask && "Unexpected mask on store"); + // Holds the indices of SVI that correspond to the starting index of each // interleaved shuffle. auto Mask = SVI->getShuffleMask(); diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index 45d596b..481a9be 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -32,6 +32,7 @@ #include "llvm/CodeGen/MachineModuleInfoImpls.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/StackMaps.h" +#include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/Mangler.h" @@ -833,6 +834,7 @@ void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI, CallInst.setOpcode(CallOpcode); CallInst.addOperand(CallTargetMCOp); OutStreamer->emitInstruction(CallInst, getSubtargetInfo()); + maybeEmitNopAfterCallForWindowsEH(&MI); } // Record our statepoint node in the same section used by STACKMAP @@ -1430,21 +1432,6 @@ void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI, OutStreamer->emitLabel(FallthroughLabel); } -// Returns instruction preceding MBBI in MachineFunction. -// If MBBI is the first instruction of the first basic block, returns null. -static MachineBasicBlock::const_iterator -PrevCrossBBInst(MachineBasicBlock::const_iterator MBBI) { - const MachineBasicBlock *MBB = MBBI->getParent(); - while (MBBI == MBB->begin()) { - if (MBB == &MBB->getParent()->front()) - return MachineBasicBlock::const_iterator(); - MBB = MBB->getPrevNode(); - MBBI = MBB->end(); - } - --MBBI; - return MBBI; -} - static unsigned getSrcIdx(const MachineInstr* MI, unsigned SrcIdx) { if (X86II::isKMasked(MI->getDesc().TSFlags)) { // Skip mask operand. 
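With the interface change above, lowerInterleavedStore now receives a generic Instruction plus an optional lane mask rather than a StoreInst. A minimal sketch of the guard a target can use when it only handles plain, unmasked stores; the X86 code above takes the same dyn_cast approach but asserts that no mask is present, and the helper name here is illustrative:

```cpp
#include "llvm/IR/Instructions.h"

// Returns true only for a plain StoreInst with no per-lane mask. Masked or
// vp-based stores are simply left untransformed by this target.
static bool isPlainUnmaskedStore(llvm::Instruction *Store,
                                 llvm::Value *LaneMask) {
  auto *SI = llvm::dyn_cast<llvm::StoreInst>(Store);
  if (!SI)
    return false;
  return LaneMask == nullptr;
}
```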
@@ -2271,6 +2258,9 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
     OutStreamer->AddComment("EVEX TO EVEX Compression ", false);
   }
 
+  // We use this to suppress NOP padding for Windows EH.
+  bool IsTailJump = false;
+
   switch (MI->getOpcode()) {
   case TargetOpcode::DBG_VALUE:
     llvm_unreachable("Should be handled target independently");
@@ -2325,6 +2315,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
 
     // Lower this as normal, but add a comment.
     OutStreamer->AddComment("TAILCALL");
+    IsTailJump = true;
     break;
 
   case X86::TAILJMPr:
@@ -2340,6 +2331,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
 
     // Lower these as normal, but add some comments.
     OutStreamer->AddComment("TAILCALL");
+    IsTailJump = true;
     break;
 
   case X86::TAILJMPm64_REX:
@@ -2349,6 +2341,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
     }
 
     OutStreamer->AddComment("TAILCALL");
+    IsTailJump = true;
     break;
 
   case X86::TAILJMPr64_REX: {
@@ -2361,6 +2354,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
     }
 
     OutStreamer->AddComment("TAILCALL");
+    IsTailJump = true;
     break;
   }
 
@@ -2537,26 +2531,6 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
 
   case X86::SEH_BeginEpilogue: {
     assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
-    // Windows unwinder will not invoke function's exception handler if IP is
-    // either in prologue or in epilogue. This behavior causes a problem when a
-    // call immediately precedes an epilogue, because the return address points
-    // into the epilogue. To cope with that, we insert a 'nop' if it ends up
-    // immediately after a CALL in the final emitted code.
-    MachineBasicBlock::const_iterator MBBI(MI);
-    // Check if preceded by a call and emit nop if so.
-    for (MBBI = PrevCrossBBInst(MBBI);
-         MBBI != MachineBasicBlock::const_iterator();
-         MBBI = PrevCrossBBInst(MBBI)) {
-      // Pseudo instructions that aren't a call are assumed to not emit any
-      // code. If they do, we worst case generate unnecessary noops after a
-      // call.
-      if (MBBI->isCall() || !MBBI->isPseudo()) {
-        if (MBBI->isCall())
-          EmitAndCountInstruction(MCInstBuilder(X86::NOOP));
-        break;
-      }
-    }
-
     EmitSEHInstruction(MI);
     return;
   }
@@ -2585,6 +2559,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
       EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX));
       emitCallInstruction(TmpInst);
       emitNop(*OutStreamer, 5, Subtarget);
+      maybeEmitNopAfterCallForWindowsEH(MI);
       return;
     }
 
@@ -2605,6 +2580,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
       // For Import Call Optimization to work, we need a 3-byte nop after the
      // call instruction.
       emitNop(*OutStreamer, 3, Subtarget);
+      maybeEmitNopAfterCallForWindowsEH(MI);
       return;
     }
     break;
@@ -2638,6 +2614,10 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
 
   if (MI->isCall()) {
     emitCallInstruction(TmpInst);
+    // Since tail calls transfer control without leaving a stack frame, there is
+    // never a need for NOP padding tail calls.
+    if (!IsTailJump)
+      maybeEmitNopAfterCallForWindowsEH(MI);
     return;
   }
 
@@ -2659,6 +2639,164 @@ void X86AsmPrinter::emitCallInstruction(const llvm::MCInst &MCI) {
   OutStreamer->emitInstruction(MCI, getSubtargetInfo());
 }
 
+// Determines whether a NOP is required after a CALL, so that Windows EH
+// IP2State tables have the correct information.
+//
+// On most Windows platforms (AMD64, ARM64, ARM32, IA64, but *not* x86-32),
+// exception handling works by looking up instruction pointers in lookup
+// tables. These lookup tables are stored in .xdata sections in executables.
+// One element of the lookup tables are the "IP2State" tables (Instruction
+// Pointer to State).
+//
+// If a function has any instructions that require cleanup during exception
+// unwinding, then it will have an IP2State table. Each entry in the IP2State
+// table describes a range of bytes in the function's instruction stream, and
+// associates an "EH state number" with that range of instructions. A value of
+// -1 means "the null state", which does not require any code to execute.
+// A value other than -1 is an index into the State table.
+//
+// The entries in the IP2State table contain byte offsets within the instruction
+// stream of the function. The Windows ABI requires that these offsets are
+// aligned to instruction boundaries; they are not permitted to point to a byte
+// that is not the first byte of an instruction.
+//
+// Unfortunately, CALL instructions present a problem during unwinding. CALL
+// instructions push the address of the instruction after the CALL instruction,
+// so that execution can resume after the CALL. If the CALL is the last
+// instruction within an IP2State region, then the return address (on the stack)
+// points to the *next* IP2State region. This means that the unwinder will
+// use the wrong cleanup funclet during unwinding.
+//
+// To fix this problem, the Windows AMD64 ABI requires that CALL instructions
+// are never placed at the end of an IP2State region. Stated equivalently, the
+// end of a CALL instruction cannot be aligned to an IP2State boundary. If a
+// CALL instruction would occur at the end of an IP2State region, then the
+// compiler must insert a NOP instruction after the CALL. The NOP instruction
+// is placed in the same EH region as the CALL instruction, so that the return
+// address points to the NOP and the unwinder will locate the correct region.
+//
+// NOP padding is only necessary on Windows AMD64 targets. On ARM64 and ARM32,
+// instructions have a fixed size so the unwinder knows how to "back up" by
+// one instruction.
+//
+// Interaction with Import Call Optimization (ICO):
+//
+// Import Call Optimization (ICO) is a compiler + OS feature on Windows which
+// improves the performance and security of DLL imports. ICO relies on using a
+// specific CALL idiom that can be replaced by the OS DLL loader. This removes
+// a load and indirect CALL and replaces it with a single direct CALL.
+//
+// To achieve this, ICO also inserts NOPs after the CALL instruction. If the
+// end of the CALL is aligned with an EH state transition, we *also* insert
+// a single-byte NOP. **Both forms of NOPs must be preserved.** They cannot
+// be combined into a single larger NOP; nor can the second NOP be removed.
+//
+// This is necessary because, if ICO is active and the call site is modified
+// by the loader, the loader will end up overwriting the NOPs that were inserted
+// for ICO. That means that those NOPs cannot be used for the correct
+// termination of the exception handling region (the IP2State transition),
+// so we still need an additional NOP instruction. The NOPs cannot be combined
+// into a longer NOP (which is ordinarily desirable) because then ICO would
+// split one instruction, producing a malformed instruction after the ICO call.
+void X86AsmPrinter::maybeEmitNopAfterCallForWindowsEH(const MachineInstr *MI) {
+  // We only need to insert NOPs after CALLs when targeting Windows on AMD64.
+  // (Don't let the name fool you: Itanium refers to table-based exception
+  // handling, not the Itanium architecture.)
+  if (MAI->getExceptionHandlingType() != ExceptionHandling::WinEH ||
+      MAI->getWinEHEncodingType() != WinEH::EncodingType::Itanium) {
+    return;
+  }
+
+  bool HasEHPersonality = MF->getWinEHFuncInfo() != nullptr;
+
+  // Set up MBB iterator, initially positioned on the same MBB as MI.
+  MachineFunction::const_iterator MFI(MI->getParent());
+  MachineFunction::const_iterator MFE(MF->end());
+
+  // Set up instruction iterator, positioned immediately *after* MI.
+  MachineBasicBlock::const_iterator MBBI(MI);
+  MachineBasicBlock::const_iterator MBBE = MI->getParent()->end();
+  ++MBBI; // Step over MI
+
+  // This loop iterates MBBs
+  for (;;) {
+    // This loop iterates instructions
+    for (; MBBI != MBBE; ++MBBI) {
+      // Check the instruction that follows this CALL.
+      const MachineInstr &NextMI = *MBBI;
+
+      // If there is an EH_LABEL after this CALL, then there is an EH state
+      // transition after this CALL. This is exactly the situation which
+      // requires NOP padding.
+      if (NextMI.isEHLabel()) {
+        if (HasEHPersonality) {
+          EmitAndCountInstruction(MCInstBuilder(X86::NOOP));
+          return;
+        }
+        // We actually want to continue, in case there is an SEH_BeginEpilogue
+        // instruction after the EH_LABEL. In some situations, IR is produced
+        // that contains EH_LABEL pseudo-instructions, even when we are not
+        // generating IP2State tables. We still need to insert a NOP before
+        // SEH_BeginEpilogue in that case.
+        continue;
+      }
+
+      // Somewhat similarly, if the CALL is the last instruction before the
+      // SEH prologue, then we also need a NOP. This is necessary because the
+      // Windows stack unwinder will not invoke a function's exception handler
+      // if the instruction pointer is in the function prologue or epilogue.
+      //
+      // We always emit a NOP before SEH_BeginEpilogue, even if there is no
+      // personality function (unwind info) for this frame. This is the same
+      // behavior as MSVC.
+      if (NextMI.getOpcode() == X86::SEH_BeginEpilogue) {
+        EmitAndCountInstruction(MCInstBuilder(X86::NOOP));
+        return;
+      }
+
+      if (!NextMI.isPseudo() && !NextMI.isMetaInstruction()) {
+        // We found a real instruction. During the CALL, the return IP will
+        // point to this instruction. Since this instruction has the same EH
+        // state as the call itself (because there is no intervening EH_LABEL),
+        // the IP2State table will be accurate; there is no need to insert a
+        // NOP.
+        return;
+      }
+
+      // The next instruction is a pseudo-op. Ignore it and keep searching.
+      // Because these instructions do not generate any machine code, they
+      // cannot prevent the IP2State table from pointing at the wrong
+      // instruction during a CALL.
+    }
+
+    // We've reached the end of this MBB. Find the next MBB in program order.
+    // MBB order should be finalized by this point, so falling across MBBs is
+    // expected.
+    ++MFI;
+    if (MFI == MFE) {
+      // No more blocks; we've reached the end of the function. This should
+      // only happen with no-return functions, but double-check to be sure.
+      if (HasEHPersonality) {
+        // If the CALL has no successors, then it is a noreturn function.
+        // Insert an INT3 instead of a NOP. This accomplishes the same purpose,
+        // but is more clear to read. Also, analysis tools will understand
+        // that they should not continue disassembling after the CALL (unless
+        // there are other branches to that label).
+        if (MI->getParent()->succ_empty())
+          EmitAndCountInstruction(MCInstBuilder(X86::INT3));
+        else
+          EmitAndCountInstruction(MCInstBuilder(X86::NOOP));
+      }
+      return;
+    }
+
+    // Set up iterator to scan the next basic block.
+    const MachineBasicBlock *NextMBB = &*MFI;
+    MBBI = NextMBB->instr_begin();
+    MBBE = NextMBB->instr_end();
+  }
+}
+
 void X86AsmPrinter::emitLabelAndRecordForImportCallOptimization(
     ImportCallKind Kind) {
   assert(EnableImportCallOptimization);
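The scan in the new function only has to answer one question per CALL: does the next byte-emitting item after the CALL start a different EH region? The following is a minimal standalone sketch of that decision rule, not code from the patch; the Kind enum, Instr struct, and needsNopAfterCall are hypothetical stand-ins for the MachineInstr-based walk in maybeEmitNopAfterCallForWindowsEH, and only the skip-pseudos / pad-before-transition logic mirrors the code above.

// Simplified model of the NOP-after-CALL decision (illustrative only).
#include <cstddef>
#include <cstdio>
#include <vector>

enum class Kind { Call, EHLabel, SEHBeginEpilogue, Pseudo, Real };

struct Instr {
  Kind K;
};

// Scan forward from the instruction after the CALL at CallIdx. Pad with a
// NOP when the next thing that emits bytes would begin a new EH region
// (EH_LABEL with a personality, or SEH_BeginEpilogue); stop at the first
// real instruction; falling off the end models the noreturn case.
bool needsNopAfterCall(const std::vector<Instr> &Stream, std::size_t CallIdx,
                       bool HasEHPersonality) {
  for (std::size_t I = CallIdx + 1; I < Stream.size(); ++I) {
    switch (Stream[I].K) {
    case Kind::EHLabel:
      if (HasEHPersonality)
        return true; // return address must stay inside the CALL's EH region
      continue;      // keep looking for an SEH_BeginEpilogue
    case Kind::SEHBeginEpilogue:
      return true;   // unwinder ignores handlers while IP is in the epilogue
    case Kind::Pseudo:
      continue;      // emits no bytes, so it cannot end the region
    default:
      return false;  // a real instruction covers the return address
    }
  }
  return HasEHPersonality; // ran off the end of the function (noreturn call)
}

int main() {
  // A CALL followed by a pseudo and then an EH state transition: pad it.
  std::vector<Instr> F = {{Kind::Call}, {Kind::Pseudo}, {Kind::EHLabel},
                          {Kind::Real}};
  std::printf("pad after call: %s\n",
              needsNopAfterCall(F, 0, /*HasEHPersonality=*/true) ? "yes" : "no");
}

In this model, a result of true corresponds to the single-byte X86::NOOP (or, for a noreturn call with no successors, the INT3) that the patch emits immediately after the CALL.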