Diffstat (limited to 'llvm/lib/Target')
 -rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp          | 178
 -rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrInfo.td              |  20
 -rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp      |   5
 -rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp  |   4
 -rw-r--r--  llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp      |  17
 -rw-r--r--  llvm/lib/Target/LoongArch/LoongArchISelLowering.h        |   3
 -rw-r--r--  llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp       |  39
 -rw-r--r--  llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp             |   5
 -rw-r--r--  llvm/lib/Target/X86/X86FixupInstTuning.cpp               |  54
 -rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp                  |  32
 -rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.h                    |   3
 -rw-r--r--  llvm/lib/Target/X86/X86InstrAVX512.td                    |  48
 -rw-r--r--  llvm/lib/Target/X86/X86InstrArithmetic.td                |  23
 -rw-r--r--  llvm/lib/Target/X86/X86InstrFragmentsSIMD.td             |  10
 14 files changed, 307 insertions, 134 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a4c1e26..899baa9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8086,13 +8086,76 @@ static SDValue getZT0FrameIndex(MachineFrameInfo &MFI,
DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
}
+// Emit a call to __arm_sme_save or __arm_sme_restore.
+static SDValue emitSMEStateSaveRestore(const AArch64TargetLowering &TLI,
+ SelectionDAG &DAG,
+ AArch64FunctionInfo *Info, SDLoc DL,
+ SDValue Chain, bool IsSave) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+ FuncInfo->setSMESaveBufferUsed();
+ TargetLowering::ArgListTy Args;
+ Args.emplace_back(
+ DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64),
+ PointerType::getUnqual(*DAG.getContext()));
+
+ RTLIB::Libcall LC =
+ IsSave ? RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE;
+ SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
+ TLI.getPointerTy(DAG.getDataLayout()));
+ auto *RetTy = Type::getVoidTy(*DAG.getContext());
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
+ TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args));
+ return TLI.LowerCallTo(CLI).second;
+}
+
+static SDValue emitRestoreZALazySave(SDValue Chain, SDLoc DL,
+ const AArch64TargetLowering &TLI,
+ const AArch64RegisterInfo &TRI,
+ AArch64FunctionInfo &FuncInfo,
+ SelectionDAG &DAG) {
+ // Conditionally restore the lazy save using a pseudo node.
+ RTLIB::Libcall LC = RTLIB::SMEABI_TPIDR2_RESTORE;
+ TPIDR2Object &TPIDR2 = FuncInfo.getTPIDR2Obj();
+ SDValue RegMask = DAG.getRegisterMask(TRI.getCallPreservedMask(
+ DAG.getMachineFunction(), TLI.getLibcallCallingConv(LC)));
+ SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
+ TLI.getLibcallName(LC), TLI.getPointerTy(DAG.getDataLayout()));
+ SDValue TPIDR2_EL0 = DAG.getNode(
+ ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Chain,
+ DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
+ // Copy the address of the TPIDR2 block into X0 before 'calling' the
+ // RESTORE_ZA pseudo.
+ SDValue Glue;
+ SDValue TPIDR2Block = DAG.getFrameIndex(
+ TPIDR2.FrameIndex,
+ DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
+ Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, TPIDR2Block, Glue);
+ Chain =
+ DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
+ {Chain, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64),
+ RestoreRoutine, RegMask, Chain.getValue(1)});
+ // Finally reset the TPIDR2_EL0 register to 0.
+ Chain = DAG.getNode(
+ ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
+ DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
+ DAG.getConstant(0, DL, MVT::i64));
+ TPIDR2.Uses++;
+ return Chain;
+}
+
SDValue AArch64TargetLowering::lowerEHPadEntry(SDValue Chain, SDLoc const &DL,
SelectionDAG &DAG) const {
assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
SDValue Glue = Chain.getValue(1);
MachineFunction &MF = DAG.getMachineFunction();
- SMEAttrs SMEFnAttrs = MF.getInfo<AArch64FunctionInfo>()->getSMEFnAttrs();
+ auto &FuncInfo = *MF.getInfo<AArch64FunctionInfo>();
+ auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
+ const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
+
+ SMEAttrs SMEFnAttrs = FuncInfo.getSMEFnAttrs();
// The following conditions are true on entry to an exception handler:
// - PSTATE.SM is 0.
@@ -8107,14 +8170,43 @@ SDValue AArch64TargetLowering::lowerEHPadEntry(SDValue Chain, SDLoc const &DL,
// These mode changes are usually optimized away in catch blocks as they
// occur before the __cxa_begin_catch (which is a non-streaming function),
// but are necessary in some cases (such as for cleanups).
+ //
+ // Additionally, if the function has ZA or ZT0 state, we must restore it.
+ // [COND_]SMSTART SM
if (SMEFnAttrs.hasStreamingInterfaceOrBody())
- return changeStreamingMode(DAG, DL, /*Enable=*/true, Chain,
- /*Glue*/ Glue, AArch64SME::Always);
+ Chain = changeStreamingMode(DAG, DL, /*Enable=*/true, Chain,
+ /*Glue*/ Glue, AArch64SME::Always);
+ else if (SMEFnAttrs.hasStreamingCompatibleInterface())
+ Chain = changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, Glue,
+ AArch64SME::IfCallerIsStreaming);
- if (SMEFnAttrs.hasStreamingCompatibleInterface())
- return changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, Glue,
- AArch64SME::IfCallerIsStreaming);
+ if (getTM().useNewSMEABILowering())
+ return Chain;
+
+ if (SMEFnAttrs.hasAgnosticZAInterface()) {
+ // Restore full ZA
+ Chain = emitSMEStateSaveRestore(*this, DAG, &FuncInfo, DL, Chain,
+ /*IsSave=*/false);
+ } else if (SMEFnAttrs.hasZAState() || SMEFnAttrs.hasZT0State()) {
+ // SMSTART ZA
+ Chain = DAG.getNode(
+ AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain,
+ DAG.getTargetConstant(int32_t(AArch64SVCR::SVCRZA), DL, MVT::i32));
+
+ // Restore ZT0
+ if (SMEFnAttrs.hasZT0State()) {
+ SDValue ZT0FrameIndex =
+ getZT0FrameIndex(MF.getFrameInfo(), FuncInfo, DAG);
+ Chain =
+ DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
+ {Chain, DAG.getConstant(0, DL, MVT::i32), ZT0FrameIndex});
+ }
+
+ // Restore ZA
+ if (SMEFnAttrs.hasZAState())
+ Chain = emitRestoreZALazySave(Chain, DL, *this, TRI, FuncInfo, DAG);
+ }
return Chain;
}
@@ -9232,30 +9324,6 @@ SDValue AArch64TargetLowering::changeStreamingMode(
return GetCheckVL(SMChange.getValue(0), SMChange.getValue(1));
}
-// Emit a call to __arm_sme_save or __arm_sme_restore.
-static SDValue emitSMEStateSaveRestore(const AArch64TargetLowering &TLI,
- SelectionDAG &DAG,
- AArch64FunctionInfo *Info, SDLoc DL,
- SDValue Chain, bool IsSave) {
- MachineFunction &MF = DAG.getMachineFunction();
- AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
- FuncInfo->setSMESaveBufferUsed();
- TargetLowering::ArgListTy Args;
- Args.emplace_back(
- DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64),
- PointerType::getUnqual(*DAG.getContext()));
-
- RTLIB::Libcall LC =
- IsSave ? RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE;
- SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
- TLI.getPointerTy(DAG.getDataLayout()));
- auto *RetTy = Type::getVoidTy(*DAG.getContext());
- TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
- TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args));
- return TLI.LowerCallTo(CLI).second;
-}
-
static AArch64SME::ToggleCondition
getSMToggleCondition(const SMECallAttrs &CallAttrs) {
if (!CallAttrs.caller().hasStreamingCompatibleInterface() ||
@@ -10015,33 +10083,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
{Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
if (RequiresLazySave) {
- // Conditionally restore the lazy save using a pseudo node.
- RTLIB::Libcall LC = RTLIB::SMEABI_TPIDR2_RESTORE;
- TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
- SDValue RegMask = DAG.getRegisterMask(
- TRI->getCallPreservedMask(MF, getLibcallCallingConv(LC)));
- SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
- getLibcallName(LC), getPointerTy(DAG.getDataLayout()));
- SDValue TPIDR2_EL0 = DAG.getNode(
- ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result,
- DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
- // Copy the address of the TPIDR2 block into X0 before 'calling' the
- // RESTORE_ZA pseudo.
- SDValue Glue;
- SDValue TPIDR2Block = DAG.getFrameIndex(
- TPIDR2.FrameIndex,
- DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
- Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
- Result =
- DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
- {Result, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64),
- RestoreRoutine, RegMask, Result.getValue(1)});
- // Finally reset the TPIDR2_EL0 register to 0.
- Result = DAG.getNode(
- ISD::INTRINSIC_VOID, DL, MVT::Other, Result,
- DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
- DAG.getConstant(0, DL, MVT::i64));
- TPIDR2.Uses++;
+ Result = emitRestoreZALazySave(Result, DL, *this, *TRI, *FuncInfo, DAG);
} else if (RequiresSaveAllZA) {
Result = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Result,
/*IsSave=*/false);
@@ -11736,6 +11778,28 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(
return DAG.getNode(ISD::AND, DL, VT, LHS, Shift);
}
+ // Check for sign bit test patterns that can use TST optimization.
+ // (SELECT_CC setlt, sign_extend_inreg, 0, tval, fval)
+ // -> TST %operand, sign_bit; CSEL
+ // (SELECT_CC setlt, sign_extend, 0, tval, fval)
+ // -> TST %operand, sign_bit; CSEL
+ if (CC == ISD::SETLT && RHSC && RHSC->isZero() && LHS.hasOneUse() &&
+ (LHS.getOpcode() == ISD::SIGN_EXTEND_INREG ||
+ LHS.getOpcode() == ISD::SIGN_EXTEND)) {
+
+ uint64_t SignBitPos;
+ std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
+ EVT TestVT = LHS.getValueType();
+ SDValue SignBitConst = DAG.getConstant(1ULL << SignBitPos, DL, TestVT);
+ SDValue TST =
+ DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(TestVT, MVT::i32),
+ LHS, SignBitConst);
+
+ SDValue Flags = TST.getValue(1);
+ return DAG.getNode(AArch64ISD::CSEL, DL, TVal.getValueType(), TVal, FVal,
+ DAG.getConstant(AArch64CC::NE, DL, MVT::i32), Flags);
+ }
+
// Canonicalise absolute difference patterns:
// select_cc lhs, rhs, sub(lhs, rhs), sub(rhs, lhs), cc ->
// select_cc lhs, rhs, sub(lhs, rhs), neg(sub(lhs, rhs)), cc
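
Note (editorial, not part of the patch): the new SELECT_CC rule above fires on source such as the following sketch, where comparing a sign-extended value against zero only needs the original sign bit. Function and parameter names are made up for illustration.

// Sketch only: assumes an AArch64 target; names are illustrative.
#include <cstdint>

int select_on_sign(int8_t v, int a, int b) {
  // (select_cc setlt (sign_extend v), 0, a, b)
  //   -> ANDS wzr, w0, #0x80 ; CSEL ..., NE   (i.e. TST of the sign bit + CSEL)
  return v < 0 ? a : b;
}
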
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 04b3c90..f788c75 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -9907,8 +9907,14 @@ def : Pat<(v4bf16 (bitconvert (v2f32 FPR64:$src))),
def : Pat<(v4bf16 (bitconvert (v1f64 FPR64:$src))),
(v4bf16 (REV64v4i16 FPR64:$src))>;
}
-def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>;
-def : Pat<(v4bf16 (bitconvert (v4i16 FPR64:$src))), (v4bf16 FPR64:$src)>;
+def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))),
+ (v4f16 FPR64:$src)>;
+def : Pat<(v4f16 (bitconvert (v4bf16 FPR64:$src))),
+ (v4f16 FPR64:$src)>;
+def : Pat<(v4bf16 (bitconvert (v4i16 FPR64:$src))),
+ (v4bf16 FPR64:$src)>;
+def : Pat<(v4bf16 (bitconvert (v4f16 FPR64:$src))),
+ (v4bf16 FPR64:$src)>;
let Predicates = [IsLE] in {
def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), (v8i8 FPR64:$src)>;
@@ -10236,8 +10242,14 @@ def : Pat<(v8bf16 (bitconvert (v2f64 FPR128:$src))),
def : Pat<(v8bf16 (bitconvert (v4f32 FPR128:$src))),
(v8bf16 (REV32v8i16 FPR128:$src))>;
}
-def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>;
-def : Pat<(v8bf16 (bitconvert (v8i16 FPR128:$src))), (v8bf16 FPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))),
+ (v8f16 FPR128:$src)>;
+def : Pat<(v8bf16 (bitconvert (v8i16 FPR128:$src))),
+ (v8bf16 FPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v8bf16 FPR128:$src))),
+ (v8f16 FPR128:$src)>;
+def : Pat<(v8bf16 (bitconvert (v8f16 FPR128:$src))),
+ (v8bf16 FPR128:$src)>;
let Predicates = [IsLE] in {
def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 FPR128:$src)>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 38718c4..7504f1a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -150,7 +150,10 @@ public:
if (!CVisited.insert(CII).second)
continue;
- if (CII->getParent() == II->getParent() && !IsLookThru(II))
+ // The same-BB filter must look at the *user*, and allow non-lookthrough
+ // users when the def is a PHI (loop-header pattern).
+ if (CII->getParent() == II->getParent() && !IsLookThru(CII) &&
+ !isa<PHINode>(II))
continue;
if (isOpLegal(CII))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index d9bfeae..0a59132 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -2562,7 +2562,9 @@ bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) {
for (Function *F : NeedsPostProcess)
Splitter.processFunction(*F);
for (Function *F : Intrinsics) {
- if (isRemovablePointerIntrinsic(F->getIntrinsicID())) {
+ // use_empty() can also occur with cases like masked load, which will
+ // have been rewritten out of the module by now but not erased.
+ if (F->use_empty() || isRemovablePointerIntrinsic(F->getIntrinsicID())) {
F->eraseFromParent();
} else {
std::optional<Function *> NewF = Intrinsic::remangleIntrinsicFunction(F);
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index ecd003c..098bcfa 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -9559,3 +9559,20 @@ bool LoongArchTargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
EVT ScalarVT = VecVT.getScalarType();
return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
}
+
+bool LoongArchTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
+ unsigned Index) const {
+ if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
+ return false;
+
+ // Extracting a 128-bit subvector from index 0 of a 256-bit vector is free.
+ return Index == 0;
+}
+
+bool LoongArchTargetLowering::isExtractVecEltCheap(EVT VT,
+ unsigned Index) const {
+ EVT EltVT = VT.getScalarType();
+
+ // Extracting a scalar FP value from index 0 of a vector is free.
+ return (EltVT == MVT::f32 || EltVT == MVT::f64) && Index == 0;
+}
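
Note (editorial, not part of the patch): a minimal sketch of what the two LoongArch hooks enable. With index-0 extracts reported as cheap, generic DAG combines are more willing to narrow 256-bit LASX operations or keep a lane-0 FP result in a vector register. The example uses Clang's generic vector extension rather than LASX intrinsics, and the function name is made up.

// Sketch only.
typedef float v8f32 __attribute__((vector_size(32)));

float lane0_sum(v8f32 a, v8f32 b) {
  v8f32 s = a + b;  // 256-bit vector add
  return s[0];      // extracting element 0 is now modelled as free
}
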
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index 3c00296..9b60a9f 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -338,6 +338,9 @@ public:
unsigned Depth) const override;
bool shouldScalarizeBinop(SDValue VecOp) const override;
+ bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
+ unsigned Index) const override;
+ bool isExtractVecEltCheap(EVT VT, unsigned Index) const override;
/// Check if a constant splat can be generated using [x]vldi, where imm[12]
/// is 1.
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 563f3bb..d4124ae 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -167,6 +167,42 @@ static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
return false;
}
+// If this i64 AND is part of ((X & -(1 << C1) & 0xffffffff) == (C2 << C1)),
+// DAGCombiner can convert this to (sraiw X, C1) == sext(C2) for RV64. On RV32,
+// the type will be split so only the lower 32 bits need to be compared using
+// (srai/srli X, C) == C2.
+static bool canUseShiftCmp(Instruction *Inst, const APInt &Imm) {
+ if (!Inst->hasOneUse())
+ return false;
+
+ // Look for equality comparison.
+ auto *Cmp = dyn_cast<ICmpInst>(*Inst->user_begin());
+ if (!Cmp || !Cmp->isEquality())
+ return false;
+
+ // Right hand side of comparison should be a constant.
+ auto *C = dyn_cast<ConstantInt>(Cmp->getOperand(1));
+ if (!C)
+ return false;
+
+ uint64_t Mask = Imm.getZExtValue();
+
+ // Mask should be of the form -(1 << C) in the lower 32 bits.
+ if (!isUInt<32>(Mask) || !isPowerOf2_32(-uint32_t(Mask)))
+ return false;
+
+ // Comparison constant should be a subset of Mask.
+ uint64_t CmpC = C->getZExtValue();
+ if ((CmpC & Mask) != CmpC)
+ return false;
+
+ // We'll need to sign extend the comparison constant and shift it right. Make
+ // sure the new constant can use addi/xori+seqz/snez.
+ unsigned ShiftBits = llvm::countr_zero(Mask);
+ int64_t NewCmpC = SignExtend64<32>(CmpC) >> ShiftBits;
+ return NewCmpC >= -2048 && NewCmpC <= 2048;
+}
+
InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
const APInt &Imm, Type *Ty,
TTI::TargetCostKind CostKind,
@@ -224,6 +260,9 @@ InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
canUseShiftPair(Inst, Imm))
return TTI::TCC_Free;
+ if (Inst && Idx == 1 && Imm.getBitWidth() == 64 &&
+ canUseShiftCmp(Inst, Imm))
+ return TTI::TCC_Free;
Takes12BitImm = true;
break;
case Instruction::Add:
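
Note (editorial, not part of the patch): a rough example of the AND-plus-compare shape canUseShiftCmp looks for. The mask 0xFFFFF000 is -(1 << 12) in the low 32 bits and the comparison constant 0x2A000 (42 << 12) is a subset of it, so the immediate is now treated as free instead of being hoisted into a register. The function name and constants are made up for illustration.

// Sketch only.
#include <cstdint>

bool upper_bits_match(uint64_t x) {
  // On RV64 the DAG combine turns this into roughly (sraiw x, 12) == 42,
  // which only needs addi/xori + seqz rather than a materialized 0xFFFFF000.
  return (x & 0xFFFFF000u) == 0x2A000u;
}
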
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
index b4fc8da..db85e33 100644
--- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
@@ -587,7 +587,8 @@ bool SPIRVLegalizerInfo::legalizeIsFPClass(
}
if (FPClassTest PartialCheck = Mask & fcNan) {
- auto InfWithQnanBitC = buildSPIRVConstant(IntTy, Inf | QNaNBitMask);
+ auto InfWithQnanBitC =
+ buildSPIRVConstant(IntTy, std::move(Inf) | QNaNBitMask);
if (PartialCheck == fcNan) {
// isnan(V) ==> abs(V) u> int(inf)
appendToRes(
@@ -613,7 +614,7 @@ bool SPIRVLegalizerInfo::legalizeIsFPClass(
APInt ExpLSB = ExpMask & ~(ExpMask.shl(1));
auto ExpMinusOne = assignSPIRVTy(
MIRBuilder.buildSub(IntTy, Abs, buildSPIRVConstant(IntTy, ExpLSB)));
- APInt MaxExpMinusOne = ExpMask - ExpLSB;
+ APInt MaxExpMinusOne = std::move(ExpMask) - ExpLSB;
auto NormalRes = assignSPIRVTy(
MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, ExpMinusOne,
buildSPIRVConstant(IntTy, MaxExpMinusOne)));
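
Note (editorial, not part of the patch): the two std::move changes above rely on llvm::APInt's binary operators taking their left-hand operand by value, so moving into that parameter avoids copying a (possibly heap-allocated) wide integer. A minimal sketch with made-up names:

// Sketch only.
#include "llvm/ADT/APInt.h"
#include <utility>

llvm::APInt or_in_bit(llvm::APInt Inf, const llvm::APInt &QNaNBitMask) {
  // Passing an lvalue here would copy Inf's storage; std::move reuses it.
  return std::move(Inf) | QNaNBitMask;
}
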
diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
index 33dc0a2..a1d4e0b 100644
--- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp
+++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
@@ -277,6 +277,22 @@ bool X86FixupInstTuningPass::processInstruction(
return true;
};
+ // Is ADD(X,X) more efficient than SHL(X,1)?
+ auto ProcessShiftLeftToAdd = [&](unsigned AddOpc) -> bool {
+ if (MI.getOperand(NumOperands - 1).getImm() != 1)
+ return false;
+ if (!NewOpcPreferable(AddOpc, /*ReplaceInTie*/ true))
+ return false;
+ LLVM_DEBUG(dbgs() << "Replacing: " << MI);
+ {
+ MI.setDesc(TII->get(AddOpc));
+ MI.removeOperand(NumOperands - 1);
+ MI.addOperand(MI.getOperand(NumOperands - 2));
+ }
+ LLVM_DEBUG(dbgs() << " With: " << MI);
+ return true;
+ };
+
switch (Opc) {
case X86::BLENDPDrri:
return ProcessBLENDToMOV(X86::MOVSDrr, 0x3, 0x1);
@@ -563,6 +579,44 @@ bool X86FixupInstTuningPass::processInstruction(
return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rmkz);
case X86::VUNPCKHPSZrmkz:
return ProcessUNPCKPS(X86::VPUNPCKHDQZrmkz);
+
+ case X86::PSLLWri:
+ return ProcessShiftLeftToAdd(X86::PADDWrr);
+ case X86::VPSLLWri:
+ return ProcessShiftLeftToAdd(X86::VPADDWrr);
+ case X86::VPSLLWYri:
+ return ProcessShiftLeftToAdd(X86::VPADDWYrr);
+ case X86::VPSLLWZ128ri:
+ return ProcessShiftLeftToAdd(X86::VPADDWZ128rr);
+ case X86::VPSLLWZ256ri:
+ return ProcessShiftLeftToAdd(X86::VPADDWZ256rr);
+ case X86::VPSLLWZri:
+ return ProcessShiftLeftToAdd(X86::VPADDWZrr);
+ case X86::PSLLDri:
+ return ProcessShiftLeftToAdd(X86::PADDDrr);
+ case X86::VPSLLDri:
+ return ProcessShiftLeftToAdd(X86::VPADDDrr);
+ case X86::VPSLLDYri:
+ return ProcessShiftLeftToAdd(X86::VPADDDYrr);
+ case X86::VPSLLDZ128ri:
+ return ProcessShiftLeftToAdd(X86::VPADDDZ128rr);
+ case X86::VPSLLDZ256ri:
+ return ProcessShiftLeftToAdd(X86::VPADDDZ256rr);
+ case X86::VPSLLDZri:
+ return ProcessShiftLeftToAdd(X86::VPADDDZrr);
+ case X86::PSLLQri:
+ return ProcessShiftLeftToAdd(X86::PADDQrr);
+ case X86::VPSLLQri:
+ return ProcessShiftLeftToAdd(X86::VPADDQrr);
+ case X86::VPSLLQYri:
+ return ProcessShiftLeftToAdd(X86::VPADDQYrr);
+ case X86::VPSLLQZ128ri:
+ return ProcessShiftLeftToAdd(X86::VPADDQZ128rr);
+ case X86::VPSLLQZ256ri:
+ return ProcessShiftLeftToAdd(X86::VPADDQZ256rr);
+ case X86::VPSLLQZri:
+ return ProcessShiftLeftToAdd(X86::VPADDQZrr);
+
default:
return false;
}
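
Note (editorial, not part of the patch): a minimal sketch of what the new shift-to-add retuning acts on. On subtargets whose scheduler model prefers the add form, the shift-left-by-one immediate below may be rewritten to an add of the register with itself. The function name is made up.

// Sketch only; requires SSE2.
#include <immintrin.h>

__m128i times_two(__m128i v) {
  return _mm_slli_epi32(v, 1);  // PSLLD $1 -> may be retuned to PADDD v, v
}
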
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index efeddd7..cd04ff5 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -4456,8 +4456,8 @@ SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
bool AllowAVX512 = true) {
assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
unsigned NumSubs = 1;
- if ((CheckBWI && Subtarget.useBWIRegs()) ||
- (!CheckBWI && AllowAVX512 && Subtarget.useAVX512Regs())) {
+ if (AllowAVX512 && ((CheckBWI && Subtarget.useBWIRegs()) ||
+ (!CheckBWI && Subtarget.useAVX512Regs()))) {
if (VT.getSizeInBits() > 512) {
NumSubs = VT.getSizeInBits() / 512;
assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
@@ -30313,22 +30313,8 @@ static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
- if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) {
- // Hardware support for vector shifts is sparse which makes us scalarize the
- // vector operations in many cases. Also, on sandybridge ADD is faster than
- // shl: (shl V, 1) -> (add (freeze V), (freeze V))
- if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
- // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
- // must be 0). (add undef, undef) however can be any value. To make this
- // safe, we must freeze R to ensure that register allocation uses the same
- // register for an undefined value. This ensures that the result will
- // still be even and preserves the original semantics.
- R = DAG.getFreeze(R);
- return DAG.getNode(ISD::ADD, dl, VT, R, R);
- }
-
+ if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
- }
// i64 SRA needs to be performed as partial shifts.
if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
@@ -31229,16 +31215,16 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
unsigned NumElts = VT.getVectorNumElements();
if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
- if (IsFSHR)
- std::swap(Op0, Op1);
if (IsCstSplat) {
+ if (IsFSHR)
+ std::swap(Op0, Op1);
uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
{Op0, Op1, Imm}, DAG, Subtarget);
}
- return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
+ return getAVX512Node(IsFSHR ? ISD::FSHR : ISD::FSHL, DL, VT,
{Op0, Op1, Amt}, DAG, Subtarget);
}
assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
@@ -35153,8 +35139,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(VALIGN)
NODE_NAME_CASE(VSHLD)
NODE_NAME_CASE(VSHRD)
- NODE_NAME_CASE(VSHLDV)
- NODE_NAME_CASE(VSHRDV)
NODE_NAME_CASE(PSHUFD)
NODE_NAME_CASE(PSHUFHW)
NODE_NAME_CASE(PSHUFLW)
@@ -45185,6 +45169,7 @@ bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
case X86ISD::Wrapper:
case X86ISD::WrapperRIP:
return true;
+ case X86ISD::INSERTPS:
case X86ISD::BLENDI:
case X86ISD::PSHUFB:
case X86ISD::PSHUFD:
@@ -45255,6 +45240,7 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
case X86ISD::BLENDV:
return false;
// SSE target shuffles.
+ case X86ISD::INSERTPS:
case X86ISD::PSHUFB:
case X86ISD::PSHUFD:
case X86ISD::UNPCKL:
@@ -46211,7 +46197,7 @@ static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
SDValue Zero = DAG.getConstant(0, DL, DpVT);
return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
- DpBuilder, false);
+ DpBuilder, /*CheckBWI=*/false, Subtarget.hasVNNI());
}
// Create a PSADBW given two sources representable as zexts of vXi8.
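
Note (editorial, not part of the patch): with X86ISD::VSHLDV/VSHRDV removed, the VBMI2 variable concat-and-shift instructions are now selected from the generic ISD::FSHL/FSHR nodes. A small sketch of code that should still end up as VPSHLDVD; the function name is made up and AVX-512 VBMI2 is assumed to be available.

// Sketch only; build with -mavx512vbmi2.
#include <immintrin.h>

__m512i funnel_left32(__m512i hi, __m512i lo, __m512i amt) {
  return _mm512_shldv_epi32(hi, lo, amt);  // lowered via a funnel shift, selected as VPSHLDVD
}
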
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 8ab8c66..b55556a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -471,8 +471,7 @@ namespace llvm {
// VBMI2 Concat & Shift.
VSHLD,
VSHRD,
- VSHLDV,
- VSHRDV,
+
// Shuffle Packed Values at 128-bit granularity.
SHUF128,
MOVDDUP,
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 2371ed4..564810c 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -12300,72 +12300,76 @@ defm : vpclmulqdq_aliases<"VPCLMULQDQZ256", VR256X, i256mem>;
// VBMI2
//===----------------------------------------------------------------------===//
-multiclass VBMI2_shift_var_rm<bits<8> Op, string OpStr, SDNode OpNode,
+multiclass VBMI2_shift_var_rm<bits<8> Op, string OpStr, SDNode OpNode, bit SwapLR,
X86FoldableSchedWrite sched, X86VectorVTInfo VTI> {
let Constraints = "$src1 = $dst",
ExeDomain = VTI.ExeDomain in {
defm r: AVX512_maskable_3src<Op, MRMSrcReg, VTI, (outs VTI.RC:$dst),
(ins VTI.RC:$src2, VTI.RC:$src3), OpStr,
"$src3, $src2", "$src2, $src3",
- (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2, VTI.RC:$src3))>,
+ !if(SwapLR,
+ (VTI.VT (OpNode (VTI.VT VTI.RC:$src2), (VTI.VT VTI.RC:$src1), (VTI.VT VTI.RC:$src3))),
+ (VTI.VT (OpNode (VTI.VT VTI.RC:$src1), (VTI.VT VTI.RC:$src2), (VTI.VT VTI.RC:$src3))))>,
T8, PD, EVEX, VVVV, Sched<[sched]>;
defm m: AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
(ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr,
"$src3, $src2", "$src2, $src3",
- (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2,
- (VTI.VT (VTI.LdFrag addr:$src3))))>,
+ !if(SwapLR,
+ (VTI.VT (OpNode (VTI.VT VTI.RC:$src2), (VTI.VT VTI.RC:$src1), (VTI.VT (VTI.LdFrag addr:$src3)))),
+ (VTI.VT (OpNode (VTI.VT VTI.RC:$src1), (VTI.VT VTI.RC:$src2), (VTI.VT (VTI.LdFrag addr:$src3)))))>,
T8, PD, EVEX, VVVV,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
-multiclass VBMI2_shift_var_rmb<bits<8> Op, string OpStr, SDNode OpNode,
+multiclass VBMI2_shift_var_rmb<bits<8> Op, string OpStr, SDNode OpNode, bit SwapLR,
X86FoldableSchedWrite sched, X86VectorVTInfo VTI>
- : VBMI2_shift_var_rm<Op, OpStr, OpNode, sched, VTI> {
+ : VBMI2_shift_var_rm<Op, OpStr, OpNode, SwapLR, sched, VTI> {
let Constraints = "$src1 = $dst",
ExeDomain = VTI.ExeDomain in
defm mb: AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
(ins VTI.RC:$src2, VTI.ScalarMemOp:$src3), OpStr,
"${src3}"#VTI.BroadcastStr#", $src2",
"$src2, ${src3}"#VTI.BroadcastStr,
- (OpNode VTI.RC:$src1, VTI.RC:$src2,
- (VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>,
+ !if(SwapLR,
+ (OpNode (VTI.VT VTI.RC:$src2), (VTI.VT VTI.RC:$src1), (VTI.VT (VTI.BroadcastLdFrag addr:$src3))),
+ (OpNode (VTI.VT VTI.RC:$src1), (VTI.VT VTI.RC:$src2), (VTI.VT (VTI.BroadcastLdFrag addr:$src3))))>,
T8, PD, EVEX, VVVV, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
-multiclass VBMI2_shift_var_rm_common<bits<8> Op, string OpStr, SDNode OpNode,
+multiclass VBMI2_shift_var_rm_common<bits<8> Op, string OpStr, SDNode OpNode, bit SwapLR,
X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTI> {
let Predicates = [HasVBMI2] in
- defm Z : VBMI2_shift_var_rm<Op, OpStr, OpNode, sched.ZMM, VTI.info512>,
+ defm Z : VBMI2_shift_var_rm<Op, OpStr, OpNode, SwapLR, sched.ZMM, VTI.info512>,
EVEX_V512;
let Predicates = [HasVBMI2, HasVLX] in {
- defm Z256 : VBMI2_shift_var_rm<Op, OpStr, OpNode, sched.YMM, VTI.info256>,
+ defm Z256 : VBMI2_shift_var_rm<Op, OpStr, OpNode, SwapLR, sched.YMM, VTI.info256>,
EVEX_V256;
- defm Z128 : VBMI2_shift_var_rm<Op, OpStr, OpNode, sched.XMM, VTI.info128>,
+ defm Z128 : VBMI2_shift_var_rm<Op, OpStr, OpNode, SwapLR, sched.XMM, VTI.info128>,
EVEX_V128;
}
}
-multiclass VBMI2_shift_var_rmb_common<bits<8> Op, string OpStr, SDNode OpNode,
+multiclass VBMI2_shift_var_rmb_common<bits<8> Op, string OpStr, SDNode OpNode, bit SwapLR,
X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTI> {
let Predicates = [HasVBMI2] in
- defm Z : VBMI2_shift_var_rmb<Op, OpStr, OpNode, sched.ZMM, VTI.info512>,
+ defm Z : VBMI2_shift_var_rmb<Op, OpStr, OpNode, SwapLR, sched.ZMM, VTI.info512>,
EVEX_V512;
let Predicates = [HasVBMI2, HasVLX] in {
- defm Z256 : VBMI2_shift_var_rmb<Op, OpStr, OpNode, sched.YMM, VTI.info256>,
+ defm Z256 : VBMI2_shift_var_rmb<Op, OpStr, OpNode, SwapLR, sched.YMM, VTI.info256>,
EVEX_V256;
- defm Z128 : VBMI2_shift_var_rmb<Op, OpStr, OpNode, sched.XMM, VTI.info128>,
+ defm Z128 : VBMI2_shift_var_rmb<Op, OpStr, OpNode, SwapLR, sched.XMM, VTI.info128>,
EVEX_V128;
}
}
multiclass VBMI2_shift_var<bits<8> wOp, bits<8> dqOp, string Prefix,
- SDNode OpNode, X86SchedWriteWidths sched> {
- defm W : VBMI2_shift_var_rm_common<wOp, Prefix#"w", OpNode, sched,
+ SDNode OpNode, bit SwapLR, X86SchedWriteWidths sched> {
+ defm W : VBMI2_shift_var_rm_common<wOp, Prefix#"w", OpNode, SwapLR, sched,
avx512vl_i16_info>, REX_W, EVEX_CD8<16, CD8VF>;
- defm D : VBMI2_shift_var_rmb_common<dqOp, Prefix#"d", OpNode, sched,
+ defm D : VBMI2_shift_var_rmb_common<dqOp, Prefix#"d", OpNode, SwapLR, sched,
avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
- defm Q : VBMI2_shift_var_rmb_common<dqOp, Prefix#"q", OpNode, sched,
+ defm Q : VBMI2_shift_var_rmb_common<dqOp, Prefix#"q", OpNode, SwapLR, sched,
avx512vl_i64_info>, REX_W, EVEX_CD8<64, CD8VF>;
}
@@ -12381,8 +12385,8 @@ multiclass VBMI2_shift_imm<bits<8> wOp, bits<8> dqOp, string Prefix,
}
// Concat & Shift
-defm VPSHLDV : VBMI2_shift_var<0x70, 0x71, "vpshldv", X86VShldv, SchedWriteVecIMul>;
-defm VPSHRDV : VBMI2_shift_var<0x72, 0x73, "vpshrdv", X86VShrdv, SchedWriteVecIMul>;
+defm VPSHLDV : VBMI2_shift_var<0x70, 0x71, "vpshldv", fshl, 0, SchedWriteVecIMul>;
+defm VPSHRDV : VBMI2_shift_var<0x72, 0x73, "vpshrdv", fshr, 1, SchedWriteVecIMul>;
defm VPSHLD : VBMI2_shift_imm<0x70, 0x71, "vpshld", X86VShld, SchedWriteVecIMul>;
defm VPSHRD : VBMI2_shift_imm<0x72, 0x73, "vpshrd", X86VShrd, SchedWriteVecIMul>;
diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td
index b4768590..031fdc1 100644
--- a/llvm/lib/Target/X86/X86InstrArithmetic.td
+++ b/llvm/lib/Target/X86/X86InstrArithmetic.td
@@ -25,18 +25,12 @@ let SchedRW = [WriteLEA] in {
[(set GR32:$dst, lea32addr:$src)]>,
OpSize32, Requires<[Not64BitMode]>;
- let Predicates = [HasNDD], isCodeGenOnly = 1 in {
- def LEA64_8r : I<0x8D, MRMSrcMem, (outs GR8:$dst), (ins lea64_8mem:$src),
- "lea{b}\t{$src|$dst}, {$dst|$src}",
- [(set GR8:$dst, lea64_iaddr:$src)]>,
- OpSize16,
- Requires<[In64BitMode]>;
-
- def LEA64_16r : I<0x8D, MRMSrcMem, (outs GR16:$dst), (ins lea64_16mem:$src),
- "lea{w}\t{$src|$dst}, {$dst|$src}",
- [(set GR16:$dst, lea64_iaddr:$src)]>,
- OpSize16,
- Requires<[In64BitMode]>;
+ let isCodeGenOnly = 1 in {
+ def LEA64_8r : I<0x8D, MRMSrcMem, (outs GR32:$dst), (ins lea64_8mem:$src),
+ "lea{l}\t{$src|$dst}, {$dst|$src}", []>, OpSize32;
+
+ def LEA64_16r : I<0x8D, MRMSrcMem, (outs GR32:$dst), (ins lea64_16mem:$src),
+ "lea{l}\t{$src|$dst}, {$dst|$src}", []>, OpSize32;
}
def LEA64_32r : I<0x8D, MRMSrcMem, (outs GR32:$dst), (ins lea64_32mem:$src),
@@ -51,6 +45,11 @@ let SchedRW = [WriteLEA] in {
[(set GR64:$dst, lea64addr:$src)]>;
} // SchedRW
+let Predicates = [HasNDD] in {
+ def : Pat<(i8 lea64_iaddr:$src), (EXTRACT_SUBREG (LEA64_8r lea64_8mem:$src), sub_8bit)>;
+ def : Pat<(i16 lea64_iaddr:$src), (EXTRACT_SUBREG (LEA64_16r lea64_16mem:$src), sub_16bit)>;
+}
+
// Pseudo instruction for lea that prevent optimizer from eliminating
// the instruction.
let SchedRW = [WriteLEA], isPseudo = true, hasSideEffects = 1 in {
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index 0c20ffe..5321ecf 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -406,16 +406,6 @@ def X86VAlign : SDNode<"X86ISD::VALIGN", SDTShuff3OpI>;
def X86VShld : SDNode<"X86ISD::VSHLD", SDTShuff3OpI>;
def X86VShrd : SDNode<"X86ISD::VSHRD", SDTShuff3OpI>;
-def X86VShldv : SDNode<"X86ISD::VSHLDV",
- SDTypeProfile<1, 3, [SDTCisVec<0>,
- SDTCisSameAs<0,1>,
- SDTCisSameAs<0,2>,
- SDTCisSameAs<0,3>]>>;
-def X86VShrdv : SDNode<"X86ISD::VSHRDV",
- SDTypeProfile<1, 3, [SDTCisVec<0>,
- SDTCisSameAs<0,1>,
- SDTCisSameAs<0,2>,
- SDTCisSameAs<0,3>]>>;
def X86Conflict : SDNode<"X86ISD::CONFLICT", SDTIntUnaryOp>;