Diffstat (limited to 'llvm/lib/Target/X86')
 llvm/lib/Target/X86/X86.h                     |  26
 llvm/lib/Target/X86/X86CompressEVEX.cpp       |   2
 llvm/lib/Target/X86/X86ISelDAGToDAG.cpp       |   4
 llvm/lib/Target/X86/X86ISelLowering.cpp       | 212
 llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp |  48
 llvm/lib/Target/X86/X86PartialReduction.cpp   |  72
 llvm/lib/Target/X86/X86PassRegistry.def       |   4
 llvm/lib/Target/X86/X86TargetMachine.cpp      |   6
 llvm/lib/Target/X86/X86VZeroUpper.cpp         |   2
 9 files changed, 241 insertions(+), 135 deletions(-)
diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h
index 51b540a..fa23656 100644
--- a/llvm/lib/Target/X86/X86.h
+++ b/llvm/lib/Target/X86/X86.h
@@ -158,7 +158,16 @@ FunctionPass *createX86InsertX87waitPass();
 /// This pass optimizes arithmetic based on knowledge that is only used by
 /// a reduction sequence and is therefore safe to reassociate in interesting
 /// ways.
-FunctionPass *createX86PartialReductionPass();
+class X86PartialReductionPass : public PassInfoMixin<X86PartialReductionPass> {
+private:
+  const X86TargetMachine *TM;
+
+public:
+  X86PartialReductionPass(const X86TargetMachine *TM) : TM(TM) {}
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+};
+
+FunctionPass *createX86PartialReductionLegacyPass();

 /// // Analyzes and emits pseudos to support Win x64 Unwind V2.
 FunctionPass *createX86WinEHUnwindV2Pass();
@@ -179,7 +188,18 @@ FunctionPass *createX86LowerAMXTypeLegacyPass();

 /// The pass transforms amx intrinsics to scalar operation if the function has
 /// optnone attribute or it is O0.
-FunctionPass *createX86LowerAMXIntrinsicsPass();
+class X86LowerAMXIntrinsicsPass
+    : public PassInfoMixin<X86LowerAMXIntrinsicsPass> {
+private:
+  const TargetMachine *TM;
+
+public:
+  X86LowerAMXIntrinsicsPass(const TargetMachine *TM) : TM(TM) {}
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+  static bool isRequired() { return true; }
+};
+
+FunctionPass *createX86LowerAMXIntrinsicsLegacyPass();

 InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM,
                                                   const X86Subtarget &,
@@ -220,7 +240,7 @@ void initializeX86LowerAMXIntrinsicsLegacyPassPass(PassRegistry &);
 void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &);
 void initializeX86LowerTileCopyPass(PassRegistry &);
 void initializeX86OptimizeLEAPassPass(PassRegistry &);
-void initializeX86PartialReductionPass(PassRegistry &);
+void initializeX86PartialReductionLegacyPass(PassRegistry &);
 void initializeX86PreTileConfigPass(PassRegistry &);
 void initializeX86ReturnThunksPass(PassRegistry &);
 void initializeX86SpeculativeExecutionSideEffectSuppressionPass(PassRegistry &);
diff --git a/llvm/lib/Target/X86/X86CompressEVEX.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp
index c0c7f5a..ddbd10d 100644
--- a/llvm/lib/Target/X86/X86CompressEVEX.cpp
+++ b/llvm/lib/Target/X86/X86CompressEVEX.cpp
@@ -272,7 +272,7 @@ static bool CompressEVEXImpl(MachineInstr &MI, MachineBasicBlock &MBB,
   const MachineOperand &Src2 = MI.getOperand(2);
   bool Is32BitReg = Opc == X86::ADD32ri_ND || Opc == X86::ADD32rr_ND;
   const MCInstrDesc &NewDesc =
-      ST.getInstrInfo()->get(Is32BitReg ? X86::LEA32r : X86::LEA64r);
+      ST.getInstrInfo()->get(Is32BitReg ? X86::LEA64_32r : X86::LEA64r);
   if (Is32BitReg)
     Src1 = getX86SubSuperRegister(Src1, 64);
   MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), NewDesc, Dst)
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index d4418c8..6c16fcfb 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -4728,9 +4728,9 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
   auto tryPeelOuterNotWrappingLogic = [&](SDNode *Op) {
     if (Op->getOpcode() == ISD::XOR && Op->hasOneUse() &&
         ISD::isBuildVectorAllOnes(Op->getOperand(1).getNode())) {
-      SDValue InnerOp = Op->getOperand(0);
+      SDValue InnerOp = getFoldableLogicOp(Op->getOperand(0));

-      if (!getFoldableLogicOp(InnerOp))
+      if (!InnerOp)
         return SDValue();

       N0 = InnerOp.getOperand(0);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b97b508..d103953 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2572,8 +2572,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   }

   // Combine sin / cos into _sincos_stret if it is available.
-  setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
-  setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
+  setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

   if (Subtarget.isTargetWin64()) {
     setOperationAction(ISD::SDIV, MVT::i128, Custom);
@@ -30908,6 +30908,63 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
     return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
   }

+  if (VT == MVT::v64i8 && Subtarget.canExtendTo512BW()) {
+    // On AVX512BW, we can use variable 16-bit shifts to implement variable
+    // 8-bit shifts. For this, we split the input into two vectors, RLo and RHi.
+    // The i-th lane of RLo contains the (2*i)-th lane of R, and the i-th lane
+    // of RHi contains the (2*i+1)-th lane of R. After shifting, these vectors
+    // can efficiently be merged together using a masked move.
+    MVT ExtVT = MVT::v32i16;
+
+    SDValue RLo, RHi;
+    // Isolate lower and upper lanes of Amt by masking odd lanes in AmtLo and
+    // right shifting AmtHi.
+    SDValue AmtLo = DAG.getNode(ISD::AND, dl, ExtVT, DAG.getBitcast(ExtVT, Amt),
+                                DAG.getConstant(0x00ff, dl, ExtVT));
+    SDValue AmtHi = getTargetVShiftByConstNode(
+        X86ISD::VSRLI, dl, ExtVT, DAG.getBitcast(ExtVT, Amt), 8, DAG);
+    switch (Opc) {
+    case ISD::SHL:
+      // Because we shift left, no bits from the high half can influence the low
+      // half, so we don't need to mask RLo. We do however need to mask RHi, to
+      // prevent high bits of an even lane overflowing into low bits of an odd
+      // lane.
+      RLo = DAG.getBitcast(ExtVT, R);
+      RHi = DAG.getNode(ISD::AND, dl, ExtVT, RLo,
+                        DAG.getConstant(0xff00, dl, ExtVT));
+      break;
+    case ISD::SRL:
+      // Same idea as above, but this time we need to make sure no low bits of
+      // an odd lane can overflow into high bits of an even lane.
+      RHi = DAG.getBitcast(ExtVT, R);
+      RLo = DAG.getNode(ISD::AND, dl, ExtVT, RHi,
+                        DAG.getConstant(0x00ff, dl, ExtVT));
+      break;
+    case ISD::SRA:
+      // For arithmetic right shifts, we want to sign extend each even lane of R
+      // such that the upper half of the corresponding lane of RLo is 0 or -1
+      // depending on the sign bit of the original lane. We do this using 2
+      // immediate shifts.
+      RHi = DAG.getBitcast(ExtVT, R);
+      RLo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, RHi, 8, DAG);
+      RLo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExtVT, RLo, 8, DAG);
+      break;
+    default:
+      llvm_unreachable("Unexpected Shift Op");
+    }
+
+    SDValue ShiftedLo =
+        DAG.getBitcast(VT, DAG.getNode(Opc, dl, ExtVT, RLo, AmtLo));
+    SDValue ShiftedHi =
+        DAG.getBitcast(VT, DAG.getNode(Opc, dl, ExtVT, RHi, AmtHi));
+
+    // To merge the shifted vectors back together, we select even lanes
+    // from ShiftedLo and odd lanes from ShiftedHi.
+    SDValue SelectMask = DAG.getBitcast(
+        MVT::v64i1, DAG.getConstant(0x5555555555555555, dl, MVT::i64));
+    return DAG.getSelect(dl, VT, SelectMask, ShiftedLo, ShiftedHi);
+  }
+
   if (VT == MVT::v16i8 ||
       (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
       (VT == MVT::v64i8 && Subtarget.hasBWI())) {
@@ -33004,61 +33061,6 @@ static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
 }

-static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
-                            SelectionDAG &DAG) {
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  SDValue Arg = Op.getOperand(0);
-  EVT ArgVT = Arg.getValueType();
-  bool isF64 = ArgVT == MVT::f64;
-
-  RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
-  const char *LibcallName = TLI.getLibcallName(LC);
-  if (!LibcallName)
-    return SDValue();
-
-  assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
-
-  // For MacOSX, we want to call an alternative entry point: __sincos_stret,
-  // which returns the values as { float, float } (in XMM0) or
-  // { double, double } (which is returned in XMM0, XMM1).
-  SDLoc dl(Op);
-  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
-
-  TargetLowering::ArgListTy Args;
-  Args.emplace_back(Arg, ArgTy);
-
-  // Only optimize x86_64 for now. i386 is a bit messy. For f32,
-  // the small struct {f32, f32} is returned in (eax, edx). For f64,
-  // the results are returned via SRet in memory.
-  SDValue Callee =
-      DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
-
-  Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
-                      : (Type *)FixedVectorType::get(ArgTy, 2);
-
-  TargetLowering::CallLoweringInfo CLI(DAG);
-  CLI.setDebugLoc(dl)
-      .setChain(DAG.getEntryNode())
-      .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args))
-      .setIsPostTypeLegalization();
-
-  std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
-
-  if (isF64)
-    // Returned in xmm0 and xmm1.
-    return CallResult.first;
-
-  // Returned in bits 0:31 and 32:64 xmm0.
-  SDValue SinVal =
-      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
-                  DAG.getVectorIdxConstant(0, dl));
-  SDValue CosVal =
-      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
-                  DAG.getVectorIdxConstant(1, dl));
-  SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
-  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
-}
-
 /// Widen a vector input to a vector of NVT. The
 /// input vector must have the same element type as NVT.
 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
@@ -33663,7 +33665,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::ABDS:
   case ISD::ABDU:               return LowerABD(Op, Subtarget, DAG);
   case ISD::AVGCEILU:           return LowerAVG(Op, Subtarget, DAG);
-  case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
   case ISD::MLOAD:              return LowerMLOAD(Op, Subtarget, DAG);
   case ISD::MSTORE:             return LowerMSTORE(Op, Subtarget, DAG);
   case ISD::MGATHER:            return LowerMGATHER(Op, Subtarget, DAG);
@@ -53349,40 +53350,45 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
 }

 // Look for a RMW operation that only touches one bit of a larger than legal
-// type and fold it to a BTC/BTR/BTS pattern acting on a single i32 sub value.
+// type and fold it to a BTC/BTR/BTS or bit insertion pattern acting on a single
+// i32 sub value.
 static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
                               SelectionDAG &DAG,
+                              TargetLowering::DAGCombinerInfo &DCI,
                               const X86Subtarget &Subtarget) {
   using namespace SDPatternMatch;
-
-  // Only handle normal stores and its chain was a matching normal load.
-  auto *Ld = dyn_cast<LoadSDNode>(St->getChain());
-  if (!ISD::isNormalStore(St) || !St->isSimple() || !Ld ||
-      !ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
-      Ld->getBasePtr() != St->getBasePtr() ||
-      Ld->getOffset() != St->getOffset())
-    return SDValue();
-
-  SDValue LoadVal(Ld, 0);
   SDValue StoredVal = St->getValue();
   EVT VT = StoredVal.getValueType();

-  // Only narrow larger than legal scalar integers.
-  if (!VT.isScalarInteger() ||
+  // Only narrow normal stores of larger than legal scalar integers.
+  if (!ISD::isNormalStore(St) || !St->isSimple() || !VT.isScalarInteger() ||
       VT.getSizeInBits() <= (Subtarget.is64Bit() ? 64 : 32))
     return SDValue();

   // BTR: X & ~(1 << ShAmt)
   // BTS: X | (1 << ShAmt)
   // BTC: X ^ (1 << ShAmt)
-  SDValue ShAmt;
-  if (!StoredVal.hasOneUse() ||
-      !(sd_match(StoredVal, m_And(m_Specific(LoadVal),
+  //
+  // BitInsert: (X & ~(1 << ShAmt)) | (InsertBit << ShAmt)
+  SDValue SrcVal, InsertBit, ShAmt;
+  if (!(sd_match(StoredVal, m_And(m_Value(SrcVal),
                                   m_Not(m_Shl(m_One(), m_Value(ShAmt))))) ||
         sd_match(StoredVal,
-                 m_Or(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
+                 m_Or(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
         sd_match(StoredVal,
-                 m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt))))))
+                 m_Xor(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
+        sd_match(
+            StoredVal,
+            m_Or(m_And(m_Value(SrcVal), m_Not(m_Shl(m_One(), m_Value(ShAmt)))),
+                 m_Shl(m_Value(InsertBit), m_Deferred(ShAmt))))))
+    return SDValue();
+
+  // SrcVal must be a matching normal load further up the chain.
+  auto *Ld = dyn_cast<LoadSDNode>(SrcVal);
+  if (!Ld || !ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
+      Ld->getBasePtr() != St->getBasePtr() ||
+      Ld->getOffset() != St->getOffset() ||
+      !St->getChain().reachesChainWithoutSideEffects(SDValue(Ld, 1)))
     return SDValue();

   // Ensure the shift amount is in bounds.
@@ -53390,6 +53396,13 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
   if (KnownAmt.getMaxValue().uge(VT.getSizeInBits()))
     return SDValue();

+  // If we're inserting a bit then it must be the LSB.
+  if (InsertBit) {
+    KnownBits KnownInsert = DAG.computeKnownBits(InsertBit);
+    if (KnownInsert.countMinLeadingZeros() < (VT.getSizeInBits() - 1))
+      return SDValue();
+  }
+
   // Split the shift into an alignment shift that moves the active i32 block to
   // the bottom bits for truncation and a modulo shift that can act on the i32.
   EVT AmtVT = ShAmt.getValueType();
@@ -53397,6 +53410,7 @@
                                  DAG.getSignedConstant(-32LL, DL, AmtVT));
   SDValue ModuloAmt =
       DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, DAG.getConstant(31, DL, AmtVT));
+  ModuloAmt = DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8);

   // Compute the byte offset for the i32 block that is changed by the RMW.
   // combineTruncate will adjust the load for us in a similar way.
@@ -53408,18 +53422,41 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
                               SDNodeFlags::NoUnsignedWrap);

   // Reconstruct the BTC/BTR/BTS pattern for the i32 block and store.
-  SDValue X = DAG.getNode(ISD::SRL, DL, VT, LoadVal, AlignAmt);
+  SDValue X = DAG.getNode(ISD::SRL, DL, VT, SrcVal, AlignAmt);
   X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
-  SDValue Mask =
-      DAG.getNode(ISD::SHL, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32),
-                  DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8));
-  if (StoredVal.getOpcode() == ISD::AND)
-    Mask = DAG.getNOT(DL, Mask, MVT::i32);
+  SDValue Mask = DAG.getNode(ISD::SHL, DL, MVT::i32,
+                             DAG.getConstant(1, DL, MVT::i32), ModuloAmt);

-  SDValue Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask);
-  return DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(),
-                      Align(), St->getMemOperand()->getFlags());
+  SDValue Res;
+  if (InsertBit) {
+    SDValue BitMask =
+        DAG.getNode(ISD::SHL, DL, MVT::i32,
+                    DAG.getZExtOrTrunc(InsertBit, DL, MVT::i32), ModuloAmt);
+    Res =
+        DAG.getNode(ISD::AND, DL, MVT::i32, X, DAG.getNOT(DL, Mask, MVT::i32));
+    Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, BitMask);
+  } else {
+    if (StoredVal.getOpcode() == ISD::AND)
+      Mask = DAG.getNOT(DL, Mask, MVT::i32);
+    Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask);
+  }
+
+  SDValue NewStore =
+      DAG.getStore(St->getChain(), DL, Res, NewPtr,
+                   MachinePointerInfo(St->getPointerInfo().getAddrSpace()),
+                   Align(), St->getMemOperand()->getFlags());
+
+  // If there are other uses of StoredVal, replace with a new load of the
+  // whole (updated) value.
+  if (!StoredVal.hasOneUse()) {
+    SDValue NewLoad =
+        DAG.getLoad(VT, DL, NewStore, Ld->getBasePtr(), Ld->getMemOperand());
+    for (SDNode *User : StoredVal->users())
+      DCI.AddToWorklist(User);
+    DAG.ReplaceAllUsesWith(StoredVal, NewLoad);
+  }
+  return NewStore;
 }

 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
@@ -53648,7 +53685,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
     }
   }

-  if (SDValue R = narrowBitOpRMW(St, dl, DAG, Subtarget))
+  if (SDValue R = narrowBitOpRMW(St, dl, DAG, DCI, Subtarget))
     return R;

   // Convert store(cmov(load(p), x, CC), p) to cstore(x, p, CC)
@@ -54606,7 +54643,8 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
       SDValue NewPtr = DAG.getMemBasePlusOffset(
           Ld->getBasePtr(), PtrByteOfs, DL, SDNodeFlags::NoUnsignedWrap);
       SDValue NewLoad =
-          DAG.getLoad(VT, DL, Ld->getChain(), NewPtr, Ld->getPointerInfo(),
+          DAG.getLoad(VT, DL, Ld->getChain(), NewPtr,
+                      MachinePointerInfo(Ld->getPointerInfo().getAddrSpace()),
                       Align(), Ld->getMemOperand()->getFlags());
       DAG.makeEquivalentMemoryOrdering(Ld, NewLoad);
       return NewLoad;
diff --git a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
index 7f33939..662aec2 100644
--- a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
+++ b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
@@ -23,12 +23,15 @@
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Analysis.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsX86.h"
+#include "llvm/IR/PassManager.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
@@ -40,7 +43,7 @@
 using namespace llvm;
 using namespace PatternMatch;

-#define DEBUG_TYPE "lower-amx-intrinsics"
+#define DEBUG_TYPE "x86-lower-amx-intrinsics"

 #ifndef NDEBUG
 static bool isV256I32Ty(Type *Ty) {
@@ -627,6 +630,37 @@ bool X86LowerAMXIntrinsics::visit() {
 }

 namespace {
+bool shouldRunLowerAMXIntrinsics(const Function &F, const TargetMachine *TM) {
+  return X86ScalarizeAMX && (F.hasFnAttribute(Attribute::OptimizeNone) ||
+                             TM->getOptLevel() == CodeGenOptLevel::None);
+}
+
+bool runLowerAMXIntrinsics(Function &F, DominatorTree *DT, LoopInfo *LI) {
+  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
+
+  X86LowerAMXIntrinsics LAT(F, DTU, LI);
+  return LAT.visit();
+}
+} // namespace
+
+PreservedAnalyses X86LowerAMXIntrinsicsPass::run(Function &F,
+                                                 FunctionAnalysisManager &FAM) {
+  if (!shouldRunLowerAMXIntrinsics(F, TM))
+    return PreservedAnalyses::all();
+
+  DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+  LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
+  bool Changed = runLowerAMXIntrinsics(F, &DT, &LI);
+  if (!Changed)
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA = PreservedAnalyses::none();
+  PA.preserve<DominatorTreeAnalysis>();
+  PA.preserve<LoopAnalysis>();
+  return PA;
+}
+
+namespace {
 class X86LowerAMXIntrinsicsLegacyPass : public FunctionPass {
 public:
   static char ID;
@@ -634,21 +668,15 @@ public:
   X86LowerAMXIntrinsicsLegacyPass() : FunctionPass(ID) {}

   bool runOnFunction(Function &F) override {
-    if (!X86ScalarizeAMX)
-      return false;
     TargetMachine *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
-    if (!F.hasFnAttribute(Attribute::OptimizeNone) &&
-        TM->getOptLevel() != CodeGenOptLevel::None)
+    if (!shouldRunLowerAMXIntrinsics(F, TM))
       return false;

     auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
     auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
     auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
     auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
-    DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
-
-    X86LowerAMXIntrinsics LAT(F, DTU, LI);
-    return LAT.visit();
+    return runLowerAMXIntrinsics(F, DT, LI);
   }

   StringRef getPassName() const override { return "Lower AMX intrinsics"; }
@@ -668,6 +696,6 @@ INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
 INITIALIZE_PASS_END(X86LowerAMXIntrinsicsLegacyPass, DEBUG_TYPE, PassName,
                     false, false)

-FunctionPass *llvm::createX86LowerAMXIntrinsicsPass() {
+FunctionPass *llvm::createX86LowerAMXIntrinsicsLegacyPass() {
   return new X86LowerAMXIntrinsicsLegacyPass();
 }
diff --git a/llvm/lib/Target/X86/X86PartialReduction.cpp b/llvm/lib/Target/X86/X86PartialReduction.cpp
index a25e4e0..898c83c 100644
--- a/llvm/lib/Target/X86/X86PartialReduction.cpp
+++ b/llvm/lib/Target/X86/X86PartialReduction.cpp
@@ -16,10 +16,12 @@
 #include "X86TargetMachine.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Analysis.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicsX86.h"
+#include "llvm/IR/PassManager.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/KnownBits.h"
@@ -30,39 +32,44 @@
 using namespace llvm;

 namespace {

-class X86PartialReduction : public FunctionPass {
+class X86PartialReduction {
+  const X86TargetMachine *TM;
   const DataLayout *DL = nullptr;
   const X86Subtarget *ST = nullptr;

 public:
+  X86PartialReduction(const X86TargetMachine *TM) : TM(TM) {}
+  bool run(Function &F);
+
+private:
+  bool tryMAddReplacement(Instruction *Op, bool ReduceInOneBB);
+  bool trySADReplacement(Instruction *Op);
+};
+
+class X86PartialReductionLegacy : public FunctionPass {
+public:
   static char ID; // Pass identification, replacement for typeid.

-  X86PartialReduction() : FunctionPass(ID) { }
+  X86PartialReductionLegacy() : FunctionPass(ID) {}

-  bool runOnFunction(Function &Fn) override;
+  bool runOnFunction(Function &F) override;

   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
   }

-  StringRef getPassName() const override {
-    return "X86 Partial Reduction";
-  }
-
-private:
-  bool tryMAddReplacement(Instruction *Op, bool ReduceInOneBB);
-  bool trySADReplacement(Instruction *Op);
+  StringRef getPassName() const override { return "X86 Partial Reduction"; }
 };
 }

-FunctionPass *llvm::createX86PartialReductionPass() {
-  return new X86PartialReduction();
+FunctionPass *llvm::createX86PartialReductionLegacyPass() {
+  return new X86PartialReductionLegacy();
 }

-char X86PartialReduction::ID = 0;
+char X86PartialReductionLegacy::ID = 0;

-INITIALIZE_PASS(X86PartialReduction, DEBUG_TYPE,
-                "X86 Partial Reduction", false, false)
+INITIALIZE_PASS(X86PartialReductionLegacy, DEBUG_TYPE, "X86 Partial Reduction",
+                false, false)

 // This function should be aligned with detectExtMul() in X86ISelLowering.cpp.
 static bool matchVPDPBUSDPattern(const X86Subtarget *ST, BinaryOperator *Mul,
@@ -494,17 +501,8 @@ static void collectLeaves(Value *Root, SmallVectorImpl<Instruction *> &Leaves) {
   }
 }

-bool X86PartialReduction::runOnFunction(Function &F) {
-  if (skipFunction(F))
-    return false;
-
-  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
-  if (!TPC)
-    return false;
-
-  auto &TM = TPC->getTM<X86TargetMachine>();
-  ST = TM.getSubtargetImpl(F);
-
+bool X86PartialReduction::run(Function &F) {
+  ST = TM->getSubtargetImpl(F);
   DL = &F.getDataLayout();

   bool MadeChange = false;
@@ -540,3 +538,25 @@

   return MadeChange;
 }
+
+bool X86PartialReductionLegacy::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+
+  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+  if (!TPC)
+    return false;
+
+  return X86PartialReduction(&TPC->getTM<X86TargetMachine>()).run(F);
+}
+
+PreservedAnalyses X86PartialReductionPass::run(Function &F,
+                                               FunctionAnalysisManager &FAM) {
+  bool Changed = X86PartialReduction(TM).run(F);
+  if (!Changed)
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA = PreservedAnalyses::none();
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
diff --git a/llvm/lib/Target/X86/X86PassRegistry.def b/llvm/lib/Target/X86/X86PassRegistry.def
index fc25d55..db25594 100644
--- a/llvm/lib/Target/X86/X86PassRegistry.def
+++ b/llvm/lib/Target/X86/X86PassRegistry.def
@@ -15,14 +15,14 @@
 #ifndef FUNCTION_PASS
 #define FUNCTION_PASS(NAME, CREATE_PASS)
 #endif
+FUNCTION_PASS("x86-lower-amx-intrinsics", X86LowerAMXIntrinsicsPass(this))
 FUNCTION_PASS("x86-lower-amx-type", X86LowerAMXTypePass(this))
+FUNCTION_PASS("x86-partial-reduction", X86PartialReductionPass(this))
 #undef FUNCTION_PASS

 #ifndef DUMMY_FUNCTION_PASS
 #define DUMMY_FUNCTION_PASS(NAME, CREATE_PASS)
 #endif
-DUMMY_FUNCTION_PASS("lower-amx-intrinsics", X86LowerAMXIntrinsics(*this))
-DUMMY_FUNCTION_PASS("x86-partial-reduction", X86PartialReduction())
 DUMMY_FUNCTION_PASS("x86-winehstate", WinEHStatePass())
 #undef DUMMY_FUNCTION_PASS

diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 9a76abc..5f0bcab 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -97,7 +97,7 @@ extern "C" LLVM_C_ABI void LLVMInitializeX86Target() {
   initializeX86LoadValueInjectionLoadHardeningPassPass(PR);
   initializeX86LoadValueInjectionRetHardeningPassPass(PR);
   initializeX86OptimizeLEAPassPass(PR);
-  initializeX86PartialReductionPass(PR);
+  initializeX86PartialReductionLegacyPass(PR);
   initializePseudoProbeInserterPass(PR);
   initializeX86ReturnThunksPass(PR);
   initializeX86DAGToDAGISelLegacyPass(PR);
@@ -422,14 +422,14 @@ void X86PassConfig::addIRPasses() {
   // We add both pass anyway and when these two passes run, we skip the pass
   // based on the option level and option attribute.
-  addPass(createX86LowerAMXIntrinsicsPass());
+  addPass(createX86LowerAMXIntrinsicsLegacyPass());
   addPass(createX86LowerAMXTypeLegacyPass());

   TargetPassConfig::addIRPasses();

   if (TM->getOptLevel() != CodeGenOptLevel::None) {
     addPass(createInterleavedAccessPass());
-    addPass(createX86PartialReductionPass());
+    addPass(createX86PartialReductionLegacyPass());
   }

   // Add passes that handle indirect branch removal and insertion of a retpoline
diff --git a/llvm/lib/Target/X86/X86VZeroUpper.cpp b/llvm/lib/Target/X86/X86VZeroUpper.cpp
index f6f7e92..2f28ab3 100644
--- a/llvm/lib/Target/X86/X86VZeroUpper.cpp
+++ b/llvm/lib/Target/X86/X86VZeroUpper.cpp
@@ -66,7 +66,7 @@ namespace {
                            MachineBasicBlock &MBB);
     void addDirtySuccessor(MachineBasicBlock &MBB);

-    using BlockExitState = enum { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY };
+    enum BlockExitState { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY };

     static const char* getBlockExitStateName(BlockExitState ST);
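
Illustrative sketches (not part of the diff)

The LowerShift() hunk above splits a v64i8 variable shift into even/odd lanes of a v32i16 vector, shifts with the AVX512BW variable 16-bit shifts, and merges the results with a masked move. The standalone program below is a scalar model of that split/shift/merge on a single lane pair, cross-checked against plain per-byte shifts. It is a sketch only: the names (ShiftOp, shift8_via16) are invented for illustration and this is not the SelectionDAG code.

// shift8_model.cpp -- scalar sketch of the v64i8-through-v32i16 shift lowering.
#include <cstdint>
#include <cstdio>

enum class ShiftOp { Shl, Srl, Sra };

// Models one 16-bit lane of RLo/RHi: 'Val' is the 8-bit lane being shifted and
// 'OddLane' says whether it lives in the high byte (RHi) or low byte (RLo) of
// its 16-bit container. Amounts are kept in [0, 7]; i8 shifts by 8 or more are
// poison in the IR anyway.
static uint8_t shift8_via16(ShiftOp Opc, uint8_t Val, uint8_t Amt, bool OddLane) {
  const unsigned A = Amt & 7;
  uint16_t R16 = 0;
  switch (Opc) {
  case ShiftOp::Shl:
    // RLo needs no mask; RHi is masked with 0xff00 so even-lane bits cannot
    // spill into the odd lane. Placing Val in the high byte models that mask.
    R16 = OddLane ? (uint16_t)(Val << 8) : (uint16_t)Val;
    R16 = (uint16_t)(R16 << A);
    break;
  case ShiftOp::Srl:
    // RHi needs no mask; RLo is masked with 0x00ff so odd-lane bits cannot
    // spill into the even lane.
    R16 = OddLane ? (uint16_t)(Val << 8) : (uint16_t)Val;
    R16 = (uint16_t)(R16 >> A);
    break;
  case ShiftOp::Sra:
    // Even lanes are sign-extended into the upper byte (the VSHLI+VSRAI pair);
    // odd lanes already carry their sign bit in the container's top bit.
    R16 = OddLane ? (uint16_t)(Val << 8) : (uint16_t)(int16_t)(int8_t)Val;
    R16 = (uint16_t)((int16_t)R16 >> A);
    break;
  }
  // The masked-move merge: even results come from the low byte (ShiftedLo),
  // odd results from the high byte (ShiftedHi).
  return OddLane ? (uint8_t)(R16 >> 8) : (uint8_t)R16;
}

int main() {
  for (int Opc = 0; Opc < 3; ++Opc)
    for (int V = 0; V < 256; ++V)
      for (int A = 0; A < 8; ++A)
        for (int Odd = 0; Odd < 2; ++Odd) {
          ShiftOp Op = (ShiftOp)Opc;
          uint8_t Got = shift8_via16(Op, (uint8_t)V, (uint8_t)A, Odd != 0);
          uint8_t Want = Op == ShiftOp::Shl   ? (uint8_t)(V << A)
                         : Op == ShiftOp::Srl ? (uint8_t)(V >> A)
                                              : (uint8_t)((int8_t)(uint8_t)V >> A);
          if (Got != Want) {
            std::printf("mismatch Opc=%d V=%d A=%d Odd=%d\n", Opc, V, A, Odd);
            return 1;
          }
        }
  std::puts("lane-pair model matches the plain per-byte shifts");
  return 0;
}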

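The narrowBitOpRMW() changes reduce a read-modify-write of a wider-than-legal integer that touches a single bit (BTS/BTR/BTC, or now a single-bit insert) to a load/op/store of just the i32 block containing that bit. The sketch below models only the addressing arithmetic (block index from ShAmt & -32, bit position from ShAmt & 31) on an array of 32-bit blocks; it is an assumption-laden illustration with invented names (BitOp, narrowBitRMW), not the DAG combine, and it does not model the replacement load the combine creates for other users of the stored value.

// narrow_bit_rmw_model.cpp -- scalar sketch of the single-bit RMW narrowing.
#include <cstdint>
#include <cstdio>

enum class BitOp { Set /*BTS*/, Reset /*BTR*/, Complement /*BTC*/, Insert };

// Update bit ShAmt of a wide integer stored as NumBlocks little-endian i32
// blocks, touching only the one block that contains the bit.
static void narrowBitRMW(uint32_t *Blocks, unsigned NumBlocks, unsigned ShAmt,
                         BitOp Op, bool InsertBit = false) {
  unsigned Block = ShAmt / 32;     // byte offset (ShAmt & -32) / 8, as an index
  unsigned ModuloAmt = ShAmt % 32; // ShAmt & 31
  if (Block >= NumBlocks)
    return;                        // bail out, like the KnownBits bound check
  uint32_t X = Blocks[Block];      // narrow load of the affected i32
  uint32_t Mask = 1u << ModuloAmt;
  switch (Op) {
  case BitOp::Set:        X |= Mask; break;  // BTS
  case BitOp::Reset:      X &= ~Mask; break; // BTR
  case BitOp::Complement: X ^= Mask; break;  // BTC
  case BitOp::Insert:     // clear the slot, then OR in the (LSB-only) bit
    X = (X & ~Mask) | ((uint32_t)InsertBit << ModuloAmt);
    break;
  }
  Blocks[Block] = X;               // narrow store
}

int main() {
  // A 128-bit value modelled as four i32 blocks, initially zero.
  uint32_t Wide[4] = {0, 0, 0, 0};
  narrowBitRMW(Wide, 4, 37, BitOp::Set);           // sets bit 5 of block 1
  narrowBitRMW(Wide, 4, 100, BitOp::Insert, true); // inserts a 1 at bit 100
  narrowBitRMW(Wide, 4, 37, BitOp::Complement);    // toggles bit 37 back off
  std::printf("%08x %08x %08x %08x\n", Wide[3], Wide[2], Wide[1], Wide[0]);
  // expected: 00000010 00000000 00000000 00000000
  return 0;
}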