Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 12
-rw-r--r-- | llvm/lib/IR/Verifier.cpp | 4
-rw-r--r-- | llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 178
-rw-r--r-- | llvm/lib/Target/AArch64/AArch64InstrInfo.td | 20
-rw-r--r-- | llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp | 17
-rw-r--r-- | llvm/lib/Target/LoongArch/LoongArchISelLowering.h | 3
-rw-r--r-- | llvm/lib/Target/X86/X86FixupInstTuning.cpp | 54
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 22
-rw-r--r-- | llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp | 50
-rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 16
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlan.cpp | 7
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanHelpers.h | 16
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 118
13 files changed, 394 insertions, 123 deletions
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index dba5a8c..cc503d3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -7492,7 +7492,6 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, // Pre-increment recursion depth for use in recursive calls. ++Depth; const SDNodeFlags Flags = Op->getFlags(); - const TargetOptions &Options = DAG.getTarget().Options; EVT VT = Op.getValueType(); unsigned Opcode = Op.getOpcode(); @@ -7572,7 +7571,7 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, return DAG.getBuildVector(VT, DL, Ops); } case ISD::FADD: { - if (!Options.NoSignedZerosFPMath && !Flags.hasNoSignedZeros()) + if (!Flags.hasNoSignedZeros()) break; // After operation legalization, it might not be legal to create new FSUBs. @@ -7617,7 +7616,7 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, } case ISD::FSUB: { // We can't turn -(A-B) into B-A when we honor signed zeros. - if (!Options.NoSignedZerosFPMath && !Flags.hasNoSignedZeros()) + if (!Flags.hasNoSignedZeros()) break; SDValue X = Op.getOperand(0), Y = Op.getOperand(1); @@ -7678,7 +7677,7 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, } case ISD::FMA: case ISD::FMAD: { - if (!Options.NoSignedZerosFPMath && !Flags.hasNoSignedZeros()) + if (!Flags.hasNoSignedZeros()) break; SDValue X = Op.getOperand(0), Y = Op.getOperand(1), Z = Op.getOperand(2); @@ -8797,7 +8796,6 @@ SDValue TargetLowering::expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *Node, EVT VT = Node->getValueType(0); EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); bool IsMax = Opc == ISD::FMAXIMUMNUM; - const TargetOptions &Options = DAG.getTarget().Options; SDNodeFlags Flags = Node->getFlags(); unsigned NewOp = @@ -8858,8 +8856,8 @@ SDValue TargetLowering::expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *Node, // TODO: We need quiet sNaN if strictfp. // Fixup signed zero behavior. - if (Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros() || - DAG.isKnownNeverZeroFloat(LHS) || DAG.isKnownNeverZeroFloat(RHS)) { + if (Flags.hasNoSignedZeros() || DAG.isKnownNeverZeroFloat(LHS) || + DAG.isKnownNeverZeroFloat(RHS)) { return MinMax; } SDValue TestZero = diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index b2e76cc..8c03d6f 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -5869,9 +5869,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { break; } case Intrinsic::call_preallocated_setup: { - auto *NumArgs = dyn_cast<ConstantInt>(Call.getArgOperand(0)); - Check(NumArgs != nullptr, - "llvm.call.preallocated.setup argument must be a constant"); + auto *NumArgs = cast<ConstantInt>(Call.getArgOperand(0)); bool FoundCall = false; for (User *U : Call.users()) { auto *UseCall = dyn_cast<CallBase>(U); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index a4c1e26..899baa9 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -8086,13 +8086,76 @@ static SDValue getZT0FrameIndex(MachineFrameInfo &MFI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); } +// Emit a call to __arm_sme_save or __arm_sme_restore. 
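Note on the TargetLowering.cpp hunks above: they key the FADD/FSUB/FMA negation folds and the FMINIMUMNUM/FMAXIMUMNUM signed-zero fixup on the per-node nsz flag alone, instead of also accepting the module-wide NoSignedZerosFPMath target option. A minimal standalone C++ sketch (illustrative only, no LLVM dependency) of the signed-zero behaviour that makes the flag necessary before folding -(A - B) into B - A:

// Standalone illustration (not LLVM code): -(A - B) and (B - A) differ only in
// the sign of a zero result, which is exactly what nsz licenses away.
#include <cmath>
#include <cstdio>

int main() {
  double A = 1.0, B = 1.0;
  double Negated = -(A - B);  // -0.0
  double Swapped = B - A;     // +0.0
  std::printf("-(A-B) = %g (signbit %d), B-A = %g (signbit %d)\n", Negated,
              (int)std::signbit(Negated), Swapped, (int)std::signbit(Swapped));
  return 0;
}

The AArch64 diff continues below.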
+static SDValue emitSMEStateSaveRestore(const AArch64TargetLowering &TLI, + SelectionDAG &DAG, + AArch64FunctionInfo *Info, SDLoc DL, + SDValue Chain, bool IsSave) { + MachineFunction &MF = DAG.getMachineFunction(); + AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); + FuncInfo->setSMESaveBufferUsed(); + TargetLowering::ArgListTy Args; + Args.emplace_back( + DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64), + PointerType::getUnqual(*DAG.getContext())); + + RTLIB::Libcall LC = + IsSave ? RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE; + SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), + TLI.getPointerTy(DAG.getDataLayout())); + auto *RetTy = Type::getVoidTy(*DAG.getContext()); + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(DL).setChain(Chain).setLibCallee( + TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)); + return TLI.LowerCallTo(CLI).second; +} + +static SDValue emitRestoreZALazySave(SDValue Chain, SDLoc DL, + const AArch64TargetLowering &TLI, + const AArch64RegisterInfo &TRI, + AArch64FunctionInfo &FuncInfo, + SelectionDAG &DAG) { + // Conditionally restore the lazy save using a pseudo node. + RTLIB::Libcall LC = RTLIB::SMEABI_TPIDR2_RESTORE; + TPIDR2Object &TPIDR2 = FuncInfo.getTPIDR2Obj(); + SDValue RegMask = DAG.getRegisterMask(TRI.getCallPreservedMask( + DAG.getMachineFunction(), TLI.getLibcallCallingConv(LC))); + SDValue RestoreRoutine = DAG.getTargetExternalSymbol( + TLI.getLibcallName(LC), TLI.getPointerTy(DAG.getDataLayout())); + SDValue TPIDR2_EL0 = DAG.getNode( + ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Chain, + DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32)); + // Copy the address of the TPIDR2 block into X0 before 'calling' the + // RESTORE_ZA pseudo. + SDValue Glue; + SDValue TPIDR2Block = DAG.getFrameIndex( + TPIDR2.FrameIndex, + DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); + Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, TPIDR2Block, Glue); + Chain = + DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other, + {Chain, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64), + RestoreRoutine, RegMask, Chain.getValue(1)}); + // Finally reset the TPIDR2_EL0 register to 0. + Chain = DAG.getNode( + ISD::INTRINSIC_VOID, DL, MVT::Other, Chain, + DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), + DAG.getConstant(0, DL, MVT::i64)); + TPIDR2.Uses++; + return Chain; +} + SDValue AArch64TargetLowering::lowerEHPadEntry(SDValue Chain, SDLoc const &DL, SelectionDAG &DAG) const { assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value"); SDValue Glue = Chain.getValue(1); MachineFunction &MF = DAG.getMachineFunction(); - SMEAttrs SMEFnAttrs = MF.getInfo<AArch64FunctionInfo>()->getSMEFnAttrs(); + auto &FuncInfo = *MF.getInfo<AArch64FunctionInfo>(); + auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>(); + const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo(); + + SMEAttrs SMEFnAttrs = FuncInfo.getSMEFnAttrs(); // The following conditions are true on entry to an exception handler: // - PSTATE.SM is 0. @@ -8107,14 +8170,43 @@ SDValue AArch64TargetLowering::lowerEHPadEntry(SDValue Chain, SDLoc const &DL, // These mode changes are usually optimized away in catch blocks as they // occur before the __cxa_begin_catch (which is a non-streaming function), // but are necessary in some cases (such as for cleanups). + // + // Additionally, if the function has ZA or ZT0 state, we must restore it. 
+ // [COND_]SMSTART SM if (SMEFnAttrs.hasStreamingInterfaceOrBody()) - return changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, - /*Glue*/ Glue, AArch64SME::Always); + Chain = changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, + /*Glue*/ Glue, AArch64SME::Always); + else if (SMEFnAttrs.hasStreamingCompatibleInterface()) + Chain = changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, Glue, + AArch64SME::IfCallerIsStreaming); - if (SMEFnAttrs.hasStreamingCompatibleInterface()) - return changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, Glue, - AArch64SME::IfCallerIsStreaming); + if (getTM().useNewSMEABILowering()) + return Chain; + + if (SMEFnAttrs.hasAgnosticZAInterface()) { + // Restore full ZA + Chain = emitSMEStateSaveRestore(*this, DAG, &FuncInfo, DL, Chain, + /*IsSave=*/false); + } else if (SMEFnAttrs.hasZAState() || SMEFnAttrs.hasZT0State()) { + // SMSTART ZA + Chain = DAG.getNode( + AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain, + DAG.getTargetConstant(int32_t(AArch64SVCR::SVCRZA), DL, MVT::i32)); + + // Restore ZT0 + if (SMEFnAttrs.hasZT0State()) { + SDValue ZT0FrameIndex = + getZT0FrameIndex(MF.getFrameInfo(), FuncInfo, DAG); + Chain = + DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other), + {Chain, DAG.getConstant(0, DL, MVT::i32), ZT0FrameIndex}); + } + + // Restore ZA + if (SMEFnAttrs.hasZAState()) + Chain = emitRestoreZALazySave(Chain, DL, *this, TRI, FuncInfo, DAG); + } return Chain; } @@ -9232,30 +9324,6 @@ SDValue AArch64TargetLowering::changeStreamingMode( return GetCheckVL(SMChange.getValue(0), SMChange.getValue(1)); } -// Emit a call to __arm_sme_save or __arm_sme_restore. -static SDValue emitSMEStateSaveRestore(const AArch64TargetLowering &TLI, - SelectionDAG &DAG, - AArch64FunctionInfo *Info, SDLoc DL, - SDValue Chain, bool IsSave) { - MachineFunction &MF = DAG.getMachineFunction(); - AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); - FuncInfo->setSMESaveBufferUsed(); - TargetLowering::ArgListTy Args; - Args.emplace_back( - DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64), - PointerType::getUnqual(*DAG.getContext())); - - RTLIB::Libcall LC = - IsSave ? RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE; - SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), - TLI.getPointerTy(DAG.getDataLayout())); - auto *RetTy = Type::getVoidTy(*DAG.getContext()); - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(DL).setChain(Chain).setLibCallee( - TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)); - return TLI.LowerCallTo(CLI).second; -} - static AArch64SME::ToggleCondition getSMToggleCondition(const SMECallAttrs &CallAttrs) { if (!CallAttrs.caller().hasStreamingCompatibleInterface() || @@ -10015,33 +10083,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, {Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx}); if (RequiresLazySave) { - // Conditionally restore the lazy save using a pseudo node. - RTLIB::Libcall LC = RTLIB::SMEABI_TPIDR2_RESTORE; - TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); - SDValue RegMask = DAG.getRegisterMask( - TRI->getCallPreservedMask(MF, getLibcallCallingConv(LC))); - SDValue RestoreRoutine = DAG.getTargetExternalSymbol( - getLibcallName(LC), getPointerTy(DAG.getDataLayout())); - SDValue TPIDR2_EL0 = DAG.getNode( - ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result, - DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32)); - // Copy the address of the TPIDR2 block into X0 before 'calling' the - // RESTORE_ZA pseudo. 
- SDValue Glue; - SDValue TPIDR2Block = DAG.getFrameIndex( - TPIDR2.FrameIndex, - DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); - Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue); - Result = - DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other, - {Result, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64), - RestoreRoutine, RegMask, Result.getValue(1)}); - // Finally reset the TPIDR2_EL0 register to 0. - Result = DAG.getNode( - ISD::INTRINSIC_VOID, DL, MVT::Other, Result, - DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i64)); - TPIDR2.Uses++; + Result = emitRestoreZALazySave(Result, DL, *this, *TRI, *FuncInfo, DAG); } else if (RequiresSaveAllZA) { Result = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Result, /*IsSave=*/false); @@ -11736,6 +11778,28 @@ SDValue AArch64TargetLowering::LowerSELECT_CC( return DAG.getNode(ISD::AND, DL, VT, LHS, Shift); } + // Check for sign bit test patterns that can use TST optimization. + // (SELECT_CC setlt, sign_extend_inreg, 0, tval, fval) + // -> TST %operand, sign_bit; CSEL + // (SELECT_CC setlt, sign_extend, 0, tval, fval) + // -> TST %operand, sign_bit; CSEL + if (CC == ISD::SETLT && RHSC && RHSC->isZero() && LHS.hasOneUse() && + (LHS.getOpcode() == ISD::SIGN_EXTEND_INREG || + LHS.getOpcode() == ISD::SIGN_EXTEND)) { + + uint64_t SignBitPos; + std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS); + EVT TestVT = LHS.getValueType(); + SDValue SignBitConst = DAG.getConstant(1ULL << SignBitPos, DL, TestVT); + SDValue TST = + DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(TestVT, MVT::i32), + LHS, SignBitConst); + + SDValue Flags = TST.getValue(1); + return DAG.getNode(AArch64ISD::CSEL, DL, TVal.getValueType(), TVal, FVal, + DAG.getConstant(AArch64CC::NE, DL, MVT::i32), Flags); + } + // Canonicalise absolute difference patterns: // select_cc lhs, rhs, sub(lhs, rhs), sub(rhs, lhs), cc -> // select_cc lhs, rhs, sub(lhs, rhs), neg(sub(lhs, rhs)), cc diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 04b3c90..f788c75 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -9907,8 +9907,14 @@ def : Pat<(v4bf16 (bitconvert (v2f32 FPR64:$src))), def : Pat<(v4bf16 (bitconvert (v1f64 FPR64:$src))), (v4bf16 (REV64v4i16 FPR64:$src))>; } -def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>; -def : Pat<(v4bf16 (bitconvert (v4i16 FPR64:$src))), (v4bf16 FPR64:$src)>; +def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))), + (v4f16 FPR64:$src)>; +def : Pat<(v4f16 (bitconvert (v4bf16 FPR64:$src))), + (v4f16 FPR64:$src)>; +def : Pat<(v4bf16 (bitconvert (v4i16 FPR64:$src))), + (v4bf16 FPR64:$src)>; +def : Pat<(v4bf16 (bitconvert (v4f16 FPR64:$src))), + (v4bf16 FPR64:$src)>; let Predicates = [IsLE] in { def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), (v8i8 FPR64:$src)>; @@ -10236,8 +10242,14 @@ def : Pat<(v8bf16 (bitconvert (v2f64 FPR128:$src))), def : Pat<(v8bf16 (bitconvert (v4f32 FPR128:$src))), (v8bf16 (REV32v8i16 FPR128:$src))>; } -def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>; -def : Pat<(v8bf16 (bitconvert (v8i16 FPR128:$src))), (v8bf16 FPR128:$src)>; +def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))), + (v8f16 FPR128:$src)>; +def : Pat<(v8bf16 (bitconvert (v8i16 FPR128:$src))), + (v8bf16 FPR128:$src)>; +def : Pat<(v8f16 (bitconvert (v8bf16 FPR128:$src))), + (v8f16 FPR128:$src)>; +def : Pat<(v8bf16 (bitconvert (v8f16 
FPR128:$src))), + (v8bf16 FPR128:$src)>; let Predicates = [IsLE] in { def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 FPR128:$src)>; diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index ecd003c..098bcfa 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -9559,3 +9559,20 @@ bool LoongArchTargetLowering::shouldScalarizeBinop(SDValue VecOp) const { EVT ScalarVT = VecVT.getScalarType(); return isOperationLegalOrCustomOrPromote(Opc, ScalarVT); } + +bool LoongArchTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, + unsigned Index) const { + if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) + return false; + + // Extract a 128-bit subvector from index 0 of a 256-bit vector is free. + return Index == 0; +} + +bool LoongArchTargetLowering::isExtractVecEltCheap(EVT VT, + unsigned Index) const { + EVT EltVT = VT.getScalarType(); + + // Extract a scalar FP value from index 0 of a vector is free. + return (EltVT == MVT::f32 || EltVT == MVT::f64) && Index == 0; +} diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h index 3c00296..9b60a9f 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h @@ -338,6 +338,9 @@ public: unsigned Depth) const override; bool shouldScalarizeBinop(SDValue VecOp) const override; + bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, + unsigned Index) const override; + bool isExtractVecEltCheap(EVT VT, unsigned Index) const override; /// Check if a constant splat can be generated using [x]vldi, where imm[12] /// is 1. diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp index 33dc0a2..a1d4e0b 100644 --- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp +++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp @@ -277,6 +277,22 @@ bool X86FixupInstTuningPass::processInstruction( return true; }; + // Is ADD(X,X) more efficient than SHL(X,1)? 
+ auto ProcessShiftLeftToAdd = [&](unsigned AddOpc) -> bool { + if (MI.getOperand(NumOperands - 1).getImm() != 1) + return false; + if (!NewOpcPreferable(AddOpc, /*ReplaceInTie*/ true)) + return false; + LLVM_DEBUG(dbgs() << "Replacing: " << MI); + { + MI.setDesc(TII->get(AddOpc)); + MI.removeOperand(NumOperands - 1); + MI.addOperand(MI.getOperand(NumOperands - 2)); + } + LLVM_DEBUG(dbgs() << " With: " << MI); + return false; + }; + switch (Opc) { case X86::BLENDPDrri: return ProcessBLENDToMOV(X86::MOVSDrr, 0x3, 0x1); @@ -563,6 +579,44 @@ bool X86FixupInstTuningPass::processInstruction( return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rmkz); case X86::VUNPCKHPSZrmkz: return ProcessUNPCKPS(X86::VPUNPCKHDQZrmkz); + + case X86::PSLLWri: + return ProcessShiftLeftToAdd(X86::PADDWrr); + case X86::VPSLLWri: + return ProcessShiftLeftToAdd(X86::VPADDWrr); + case X86::VPSLLWYri: + return ProcessShiftLeftToAdd(X86::VPADDWYrr); + case X86::VPSLLWZ128ri: + return ProcessShiftLeftToAdd(X86::VPADDWZ128rr); + case X86::VPSLLWZ256ri: + return ProcessShiftLeftToAdd(X86::VPADDWZ256rr); + case X86::VPSLLWZri: + return ProcessShiftLeftToAdd(X86::VPADDWZrr); + case X86::PSLLDri: + return ProcessShiftLeftToAdd(X86::PADDDrr); + case X86::VPSLLDri: + return ProcessShiftLeftToAdd(X86::VPADDDrr); + case X86::VPSLLDYri: + return ProcessShiftLeftToAdd(X86::VPADDDYrr); + case X86::VPSLLDZ128ri: + return ProcessShiftLeftToAdd(X86::VPADDDZ128rr); + case X86::VPSLLDZ256ri: + return ProcessShiftLeftToAdd(X86::VPADDDZ256rr); + case X86::VPSLLDZri: + return ProcessShiftLeftToAdd(X86::VPADDDZrr); + case X86::PSLLQri: + return ProcessShiftLeftToAdd(X86::PADDQrr); + case X86::VPSLLQri: + return ProcessShiftLeftToAdd(X86::VPADDQrr); + case X86::VPSLLQYri: + return ProcessShiftLeftToAdd(X86::VPADDQYrr); + case X86::VPSLLQZ128ri: + return ProcessShiftLeftToAdd(X86::VPADDQZ128rr); + case X86::VPSLLQZ256ri: + return ProcessShiftLeftToAdd(X86::VPADDQZ256rr); + case X86::VPSLLQZri: + return ProcessShiftLeftToAdd(X86::VPADDQZrr); + default: return false; } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index efeddd7..e7eb67a 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -4456,8 +4456,8 @@ SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget, bool AllowAVX512 = true) { assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2"); unsigned NumSubs = 1; - if ((CheckBWI && Subtarget.useBWIRegs()) || - (!CheckBWI && AllowAVX512 && Subtarget.useAVX512Regs())) { + if (AllowAVX512 && ((CheckBWI && Subtarget.useBWIRegs()) || + (!CheckBWI && Subtarget.useAVX512Regs()))) { if (VT.getSizeInBits() > 512) { NumSubs = VT.getSizeInBits() / 512; assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size"); @@ -30313,22 +30313,8 @@ static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG, uint64_t ShiftAmt = APIntShiftAmt.getZExtValue(); - if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) { - // Hardware support for vector shifts is sparse which makes us scalarize the - // vector operations in many cases. Also, on sandybridge ADD is faster than - // shl: (shl V, 1) -> (add (freeze V), (freeze V)) - if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) { - // R may be undef at run-time, but (shl R, 1) must be an even number (LSB - // must be 0). (add undef, undef) however can be any value. 
To make this - // safe, we must freeze R to ensure that register allocation uses the same - // register for an undefined value. This ensures that the result will - // still be even and preserves the original semantics. - R = DAG.getFreeze(R); - return DAG.getNode(ISD::ADD, dl, VT, R, R); - } - + if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG); - } // i64 SRA needs to be performed as partial shifts. if (((!Subtarget.hasXOP() && VT == MVT::v2i64) || @@ -46211,7 +46197,7 @@ static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS, SDValue Zero = DAG.getConstant(0, DL, DpVT); return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1}, - DpBuilder, false); + DpBuilder, /*CheckBWI=*/false, Subtarget.hasVNNI()); } // Create a PSADBW given two sources representable as zexts of vXi8. diff --git a/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp b/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp index c215228..89980d5 100644 --- a/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp +++ b/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/DropUnnecessaryAssumes.h" +#include "llvm/ADT/SetVector.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/IntrinsicInst.h" @@ -17,13 +18,48 @@ using namespace llvm; using namespace llvm::PatternMatch; static bool affectedValuesAreEphemeral(ArrayRef<Value *> Affected) { - // If all the affected uses have only one use (part of the assume), then - // the assume does not provide useful information. Note that additional - // users may appear as a result of inlining and CSE, so we should only - // make this assumption late in the optimization pipeline. - // TODO: Handle dead cyclic usages. - // TODO: Handle multiple dead assumes on the same value. - return all_of(Affected, match_fn(m_OneUse(m_Value()))); + // Check whether all the uses are ephemeral, i.e. recursively only used + // by assumes. In that case, the assume does not provide useful information. + // Note that additional users may appear as a result of inlining and CSE, + // so we should only make this assumption late in the optimization pipeline. + SmallSetVector<Instruction *, 32> Worklist; + auto AddUsers = [&](Value *V) { + for (User *U : V->users()) { + // Bail out if we need to inspect too many users. + if (Worklist.size() >= 32) + return false; + Worklist.insert(cast<Instruction>(U)); + } + return true; + }; + + for (Value *V : Affected) { + // Do not handle assumes on globals for now. The use list for them may + // contain uses in other functions. + if (!isa<Instruction, Argument>(V)) + return false; + + if (!AddUsers(V)) + return false; + } + + for (unsigned Idx = 0; Idx < Worklist.size(); ++Idx) { + Instruction *I = Worklist[Idx]; + + // Use in assume is ephemeral. + if (isa<AssumeInst>(I)) + continue; + + // Use in side-effecting instruction is non-ephemeral. + if (I->mayHaveSideEffects() || I->isTerminator()) + return false; + + // Otherwise, recursively look at the users. 
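A simplified standalone rendering of the bounded walk implemented in this DropUnnecessaryAssumes.cpp hunk, using toy node and user-list types rather than LLVM IR; the 32-entry cap and the visit-once set-vector behaviour are the parts that matter:

// Simplified, standalone sketch of the walk (toy types, not LLVM IR).
#include <algorithm>
#include <cstddef>
#include <vector>

struct Node {
  bool IsAssume = false;       // stands in for isa<AssumeInst>
  bool HasSideEffects = false; // stands in for mayHaveSideEffects()/isTerminator()
  std::vector<Node *> Users;
};

static bool allTransitiveUsersAreAssumes(const std::vector<Node *> &Affected) {
  std::vector<Node *> Worklist;
  auto AddUsers = [&](Node *N) {
    for (Node *U : N->Users) {
      if (Worklist.size() >= 32) // bail out if too many users must be inspected
        return false;
      if (std::find(Worklist.begin(), Worklist.end(), U) == Worklist.end())
        Worklist.push_back(U);   // visit each user once, like a SetVector
    }
    return true;
  };

  for (Node *V : Affected)
    if (!AddUsers(V))
      return false;

  // Index-based loop so users appended during the walk are visited too.
  for (std::size_t Idx = 0; Idx < Worklist.size(); ++Idx) {
    Node *N = Worklist[Idx];
    if (N->IsAssume)
      continue;        // a use inside an assume is ephemeral
    if (N->HasSideEffects)
      return false;    // side-effecting users make the value non-ephemeral
    if (!AddUsers(N))  // otherwise keep following the users
      return false;
  }
  return true;
}

The remaining lines of the hunk, continuing below, perform exactly this recursion via AddUsers.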
+ if (!AddUsers(I)) + return false; + } + + return true; } PreservedAnalyses diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 96f52076..ab5c9c9 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3902,7 +3902,8 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks( if (VF.isScalar()) continue; - VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, + *CM.PSE.getSE()); precomputeCosts(*Plan, VF, CostCtx); auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry()); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { @@ -4159,7 +4160,8 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { // Add on other costs that are modelled in VPlan, but not in the legacy // cost model. - VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind, + *CM.PSE.getSE()); VPRegionBlock *VectorRegion = P->getVectorLoopRegion(); assert(VectorRegion && "Expected to have a vector region!"); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( @@ -6834,7 +6836,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, ElementCount VF) const { - VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE()); InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx); // Now compute and add the VPlan-based cost. @@ -7067,7 +7069,8 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { // simplifications not accounted for in the legacy cost model. If that's the // case, don't trigger the assertion, as the extra simplifications may cause a // different VF to be picked by the VPlan-based cost model. - VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind, + *CM.PSE.getSE()); precomputeCosts(BestPlan, BestFactor.Width, CostCtx); // Verify that the VPlan-based and legacy cost models agree, except for VPlans // with early exits and plans with additional VPlan simplifications. The @@ -8597,7 +8600,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // TODO: Enable following transform when the EVL-version of extended-reduction // and mulacc-reduction are implemented. 
if (!CM.foldTailWithEVL()) { - VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, + *CM.PSE.getSE()); VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan, CostCtx, Range); } @@ -10054,7 +10058,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM, - CM.CostKind); + CM.CostKind, *CM.PSE.getSE()); if (!ForceVectorization && !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx, LVP.getPlanFor(VF.Width), SEL, diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 81f1956..728d291 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1750,7 +1750,8 @@ VPCostContext::getOperandInfo(VPValue *V) const { } InstructionCost VPCostContext::getScalarizationOverhead( - Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF) { + Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF, + bool AlwaysIncludeReplicatingR) { if (VF.isScalar()) return 0; @@ -1770,7 +1771,9 @@ InstructionCost VPCostContext::getScalarizationOverhead( SmallPtrSet<const VPValue *, 4> UniqueOperands; SmallVector<Type *> Tys; for (auto *Op : Operands) { - if (Op->isLiveIn() || isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op) || + if (Op->isLiveIn() || + (!AlwaysIncludeReplicatingR && + isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op)) || !UniqueOperands.insert(Op).second) continue; Tys.push_back(toVectorizedTy(Types.inferScalarType(Op), VF)); diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h index fe59774..2a8baec 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h +++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h @@ -349,12 +349,14 @@ struct VPCostContext { LoopVectorizationCostModel &CM; SmallPtrSet<Instruction *, 8> SkipCostComputation; TargetTransformInfo::TargetCostKind CostKind; + ScalarEvolution &SE; VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const VPlan &Plan, LoopVectorizationCostModel &CM, - TargetTransformInfo::TargetCostKind CostKind) + TargetTransformInfo::TargetCostKind CostKind, + ScalarEvolution &SE) : TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM), - CostKind(CostKind) {} + CostKind(CostKind), SE(SE) {} /// Return the cost for \p UI with \p VF using the legacy cost model as /// fallback until computing the cost of all recipes migrates to VPlan. @@ -374,10 +376,12 @@ struct VPCostContext { /// Estimate the overhead of scalarizing a recipe with result type \p ResultTy /// and \p Operands with \p VF. This is a convenience wrapper for the - /// type-based getScalarizationOverhead API. - InstructionCost getScalarizationOverhead(Type *ResultTy, - ArrayRef<const VPValue *> Operands, - ElementCount VF); + /// type-based getScalarizationOverhead API. If \p AlwaysIncludeReplicatingR + /// is true, always compute the cost of scalarizing replicating operands. + InstructionCost + getScalarizationOverhead(Type *ResultTy, ArrayRef<const VPValue *> Operands, + ElementCount VF, + bool AlwaysIncludeReplicatingR = false); }; /// This class can be used to assign names to VPValues. 
For VPValues without diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index cf5e6bf..b5e30cb 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -3069,6 +3069,61 @@ bool VPReplicateRecipe::shouldPack() const { }); } +/// Returns true if \p Ptr is a pointer computation for which the legacy cost +/// model computes a SCEV expression when computing the address cost. +static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) { + auto *PtrR = Ptr->getDefiningRecipe(); + if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) && + cast<VPReplicateRecipe>(PtrR)->getOpcode() == + Instruction::GetElementPtr) || + isa<VPWidenGEPRecipe>(PtrR))) + return false; + + // We are looking for a GEP where all indices are either loop invariant or + // inductions. + for (VPValue *Opd : drop_begin(PtrR->operands())) { + if (!Opd->isDefinedOutsideLoopRegions() && + !isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd)) + return false; + } + + return true; +} + +/// Returns true if \p V is used as part of the address of another load or +/// store. +static bool isUsedByLoadStoreAddress(const VPUser *V) { + SmallPtrSet<const VPUser *, 4> Seen; + SmallVector<const VPUser *> WorkList = {V}; + + while (!WorkList.empty()) { + auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val()); + if (!Cur || !Seen.insert(Cur).second) + continue; + + for (VPUser *U : Cur->users()) { + if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(U)) + if (InterleaveR->getAddr() == Cur) + return true; + if (auto *RepR = dyn_cast<VPReplicateRecipe>(U)) { + if (RepR->getOpcode() == Instruction::Load && + RepR->getOperand(0) == Cur) + return true; + if (RepR->getOpcode() == Instruction::Store && + RepR->getOperand(1) == Cur) + return true; + } + if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) { + if (MemR->getAddr() == Cur && MemR->isConsecutive()) + return true; + } + } + + append_range(WorkList, cast<VPSingleDefRecipe>(Cur)->users()); + } + return false; +} + InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, VPCostContext &Ctx) const { Instruction *UI = cast<Instruction>(getUnderlyingValue()); @@ -3176,21 +3231,58 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, } case Instruction::Load: case Instruction::Store: { - if (isSingleScalar()) { - bool IsLoad = UI->getOpcode() == Instruction::Load; - Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0)); - Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1)); - const Align Alignment = getLoadStoreAlignment(UI); - unsigned AS = getLoadStoreAddressSpace(UI); - TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0)); - InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost( - UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UI); - return ScalarMemOpCost + Ctx.TTI.getAddressComputationCost( - ScalarPtrTy, nullptr, nullptr, Ctx.CostKind); - } + if (VF.isScalable() && !isSingleScalar()) + return InstructionCost::getInvalid(); + // TODO: See getMemInstScalarizationCost for how to handle replicating and // predicated cases. - break; + const VPRegionBlock *ParentRegion = getParent()->getParent(); + if (ParentRegion && ParentRegion->isReplicator()) + break; + + bool IsLoad = UI->getOpcode() == Instruction::Load; + const VPValue *PtrOp = getOperand(!IsLoad); + // TODO: Handle cases where we need to pass a SCEV to + // getAddressComputationCost. 
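For context on shouldUseAddressAccessSCEV above: it matches GEPs whose non-pointer operands are all loop-invariant or inductions, i.e. addresses that are affine in the induction variable and hence SCEV-analyzable, as opposed to data-dependent addresses. A hedged standalone illustration of the two shapes (plain C++, not LLVM code):

// Standalone illustration: a[i] is affine in the induction variable and
// SCEV-analyzable, while a[idx[i]] depends on loaded data and is not.
void affineAddress(const float *a, float *out, int n) {
  for (int i = 0; i < n; ++i)
    out[i] = a[i];      // GEP index is the induction variable itself
}

void dataDependentAddress(const float *a, const int *idx, float *out, int n) {
  for (int i = 0; i < n; ++i)
    out[i] = a[idx[i]]; // GEP index is a loaded value, not an induction
}

The VPlanRecipes.cpp hunk continues below.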
+ if (shouldUseAddressAccessSCEV(PtrOp)) + break; + + Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0)); + Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp); + const Align Alignment = getLoadStoreAlignment(UI); + unsigned AS = getLoadStoreAddressSpace(UI); + TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0)); + InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost( + UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo); + + Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF); + + InstructionCost ScalarCost = + ScalarMemOpCost + Ctx.TTI.getAddressComputationCost( + PtrTy, &Ctx.SE, nullptr, Ctx.CostKind); + if (isSingleScalar()) + return ScalarCost; + + SmallVector<const VPValue *> OpsToScalarize; + Type *ResultTy = Type::getVoidTy(PtrTy->getContext()); + // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we + // don't assign scalarization overhead in general, if the target prefers + // vectorized addressing or the loaded value is used as part of an address + // of another load or store. + bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing(); + if (PreferVectorizedAddressing || !isUsedByLoadStoreAddress(this)) { + bool EfficientVectorLoadStore = + Ctx.TTI.supportsEfficientVectorElementLoadStore(); + if (!(IsLoad && !PreferVectorizedAddressing) && + !(!IsLoad && EfficientVectorLoadStore)) + append_range(OpsToScalarize, operands()); + + if (!EfficientVectorLoadStore) + ResultTy = Ctx.Types.inferScalarType(this); + } + + return (ScalarCost * VF.getFixedValue()) + + Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, true); } } |
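Putting the new VPReplicateRecipe::computeCost arithmetic together: a replicated memory operation is priced as VF copies of the scalar access plus its address computation, with scalarization overhead added only when the guard above decides to charge it. A back-of-the-envelope sketch with made-up unit costs (nothing here is LLVM API; the real numbers come from TargetTransformInfo):

// Back-of-the-envelope model of the replicated memory-op cost; every unit
// cost below is a made-up placeholder, not a real TTI number.
#include <cstdio>

static unsigned replicatedMemOpCost(unsigned VF, bool ChargeScalarizationOverhead) {
  const unsigned ScalarMemOpCost = 1;        // one scalar load/store
  const unsigned AddressComputationCost = 1; // one scalar address computation
  const unsigned ScalarCost = ScalarMemOpCost + AddressComputationCost;

  // Stand-in for getScalarizationOverhead(); whether it is charged at all is
  // decided above from prefersVectorizedAddressing() and
  // isUsedByLoadStoreAddress().
  const unsigned Overhead = ChargeScalarizationOverhead ? 3 : 0;

  return ScalarCost * VF + Overhead;
}

int main() {
  std::printf("VF=1 (single scalar): %u\n", replicatedMemOpCost(1, false));
  std::printf("VF=4, with scalarization overhead: %u\n",
              replicatedMemOpCost(4, true));
  return 0;
}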