Diffstat (limited to 'llvm/lib/Target')
-rw-r--r-- | llvm/lib/Target/AArch64/AArch64ISelLowering.cpp         | 178
-rw-r--r-- | llvm/lib/Target/AArch64/AArch64InstrInfo.td             |  20
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp     |   5
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp |   4
-rw-r--r-- | llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp     |  17
-rw-r--r-- | llvm/lib/Target/LoongArch/LoongArchISelLowering.h       |   3
-rw-r--r-- | llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp      |  39
-rw-r--r-- | llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp            |   5
-rw-r--r-- | llvm/lib/Target/X86/X86FixupInstTuning.cpp              |  54
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp                 |  32
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.h                   |   3
-rw-r--r-- | llvm/lib/Target/X86/X86InstrAVX512.td                   |  48
-rw-r--r-- | llvm/lib/Target/X86/X86InstrArithmetic.td               |  23
-rw-r--r-- | llvm/lib/Target/X86/X86InstrFragmentsSIMD.td            |  10
14 files changed, 307 insertions, 134 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index a4c1e26..899baa9 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -8086,13 +8086,76 @@ static SDValue getZT0FrameIndex(MachineFrameInfo &MFI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); } +// Emit a call to __arm_sme_save or __arm_sme_restore. +static SDValue emitSMEStateSaveRestore(const AArch64TargetLowering &TLI, + SelectionDAG &DAG, + AArch64FunctionInfo *Info, SDLoc DL, + SDValue Chain, bool IsSave) { + MachineFunction &MF = DAG.getMachineFunction(); + AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); + FuncInfo->setSMESaveBufferUsed(); + TargetLowering::ArgListTy Args; + Args.emplace_back( + DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64), + PointerType::getUnqual(*DAG.getContext())); + + RTLIB::Libcall LC = + IsSave ? RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE; + SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), + TLI.getPointerTy(DAG.getDataLayout())); + auto *RetTy = Type::getVoidTy(*DAG.getContext()); + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(DL).setChain(Chain).setLibCallee( + TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)); + return TLI.LowerCallTo(CLI).second; +} + +static SDValue emitRestoreZALazySave(SDValue Chain, SDLoc DL, + const AArch64TargetLowering &TLI, + const AArch64RegisterInfo &TRI, + AArch64FunctionInfo &FuncInfo, + SelectionDAG &DAG) { + // Conditionally restore the lazy save using a pseudo node. + RTLIB::Libcall LC = RTLIB::SMEABI_TPIDR2_RESTORE; + TPIDR2Object &TPIDR2 = FuncInfo.getTPIDR2Obj(); + SDValue RegMask = DAG.getRegisterMask(TRI.getCallPreservedMask( + DAG.getMachineFunction(), TLI.getLibcallCallingConv(LC))); + SDValue RestoreRoutine = DAG.getTargetExternalSymbol( + TLI.getLibcallName(LC), TLI.getPointerTy(DAG.getDataLayout())); + SDValue TPIDR2_EL0 = DAG.getNode( + ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Chain, + DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32)); + // Copy the address of the TPIDR2 block into X0 before 'calling' the + // RESTORE_ZA pseudo. + SDValue Glue; + SDValue TPIDR2Block = DAG.getFrameIndex( + TPIDR2.FrameIndex, + DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); + Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, TPIDR2Block, Glue); + Chain = + DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other, + {Chain, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64), + RestoreRoutine, RegMask, Chain.getValue(1)}); + // Finally reset the TPIDR2_EL0 register to 0. 
+ Chain = DAG.getNode( + ISD::INTRINSIC_VOID, DL, MVT::Other, Chain, + DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), + DAG.getConstant(0, DL, MVT::i64)); + TPIDR2.Uses++; + return Chain; +} + SDValue AArch64TargetLowering::lowerEHPadEntry(SDValue Chain, SDLoc const &DL, SelectionDAG &DAG) const { assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value"); SDValue Glue = Chain.getValue(1); MachineFunction &MF = DAG.getMachineFunction(); - SMEAttrs SMEFnAttrs = MF.getInfo<AArch64FunctionInfo>()->getSMEFnAttrs(); + auto &FuncInfo = *MF.getInfo<AArch64FunctionInfo>(); + auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>(); + const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo(); + + SMEAttrs SMEFnAttrs = FuncInfo.getSMEFnAttrs(); // The following conditions are true on entry to an exception handler: // - PSTATE.SM is 0. @@ -8107,14 +8170,43 @@ SDValue AArch64TargetLowering::lowerEHPadEntry(SDValue Chain, SDLoc const &DL, // These mode changes are usually optimized away in catch blocks as they // occur before the __cxa_begin_catch (which is a non-streaming function), // but are necessary in some cases (such as for cleanups). + // + // Additionally, if the function has ZA or ZT0 state, we must restore it. + // [COND_]SMSTART SM if (SMEFnAttrs.hasStreamingInterfaceOrBody()) - return changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, - /*Glue*/ Glue, AArch64SME::Always); + Chain = changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, + /*Glue*/ Glue, AArch64SME::Always); + else if (SMEFnAttrs.hasStreamingCompatibleInterface()) + Chain = changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, Glue, + AArch64SME::IfCallerIsStreaming); - if (SMEFnAttrs.hasStreamingCompatibleInterface()) - return changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, Glue, - AArch64SME::IfCallerIsStreaming); + if (getTM().useNewSMEABILowering()) + return Chain; + + if (SMEFnAttrs.hasAgnosticZAInterface()) { + // Restore full ZA + Chain = emitSMEStateSaveRestore(*this, DAG, &FuncInfo, DL, Chain, + /*IsSave=*/false); + } else if (SMEFnAttrs.hasZAState() || SMEFnAttrs.hasZT0State()) { + // SMSTART ZA + Chain = DAG.getNode( + AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain, + DAG.getTargetConstant(int32_t(AArch64SVCR::SVCRZA), DL, MVT::i32)); + + // Restore ZT0 + if (SMEFnAttrs.hasZT0State()) { + SDValue ZT0FrameIndex = + getZT0FrameIndex(MF.getFrameInfo(), FuncInfo, DAG); + Chain = + DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other), + {Chain, DAG.getConstant(0, DL, MVT::i32), ZT0FrameIndex}); + } + + // Restore ZA + if (SMEFnAttrs.hasZAState()) + Chain = emitRestoreZALazySave(Chain, DL, *this, TRI, FuncInfo, DAG); + } return Chain; } @@ -9232,30 +9324,6 @@ SDValue AArch64TargetLowering::changeStreamingMode( return GetCheckVL(SMChange.getValue(0), SMChange.getValue(1)); } -// Emit a call to __arm_sme_save or __arm_sme_restore. -static SDValue emitSMEStateSaveRestore(const AArch64TargetLowering &TLI, - SelectionDAG &DAG, - AArch64FunctionInfo *Info, SDLoc DL, - SDValue Chain, bool IsSave) { - MachineFunction &MF = DAG.getMachineFunction(); - AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); - FuncInfo->setSMESaveBufferUsed(); - TargetLowering::ArgListTy Args; - Args.emplace_back( - DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64), - PointerType::getUnqual(*DAG.getContext())); - - RTLIB::Libcall LC = - IsSave ? 
RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE; - SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), - TLI.getPointerTy(DAG.getDataLayout())); - auto *RetTy = Type::getVoidTy(*DAG.getContext()); - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(DL).setChain(Chain).setLibCallee( - TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)); - return TLI.LowerCallTo(CLI).second; -} - static AArch64SME::ToggleCondition getSMToggleCondition(const SMECallAttrs &CallAttrs) { if (!CallAttrs.caller().hasStreamingCompatibleInterface() || @@ -10015,33 +10083,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, {Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx}); if (RequiresLazySave) { - // Conditionally restore the lazy save using a pseudo node. - RTLIB::Libcall LC = RTLIB::SMEABI_TPIDR2_RESTORE; - TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); - SDValue RegMask = DAG.getRegisterMask( - TRI->getCallPreservedMask(MF, getLibcallCallingConv(LC))); - SDValue RestoreRoutine = DAG.getTargetExternalSymbol( - getLibcallName(LC), getPointerTy(DAG.getDataLayout())); - SDValue TPIDR2_EL0 = DAG.getNode( - ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result, - DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32)); - // Copy the address of the TPIDR2 block into X0 before 'calling' the - // RESTORE_ZA pseudo. - SDValue Glue; - SDValue TPIDR2Block = DAG.getFrameIndex( - TPIDR2.FrameIndex, - DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); - Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue); - Result = - DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other, - {Result, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64), - RestoreRoutine, RegMask, Result.getValue(1)}); - // Finally reset the TPIDR2_EL0 register to 0. - Result = DAG.getNode( - ISD::INTRINSIC_VOID, DL, MVT::Other, Result, - DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i64)); - TPIDR2.Uses++; + Result = emitRestoreZALazySave(Result, DL, *this, *TRI, *FuncInfo, DAG); } else if (RequiresSaveAllZA) { Result = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Result, /*IsSave=*/false); @@ -11736,6 +11778,28 @@ SDValue AArch64TargetLowering::LowerSELECT_CC( return DAG.getNode(ISD::AND, DL, VT, LHS, Shift); } + // Check for sign bit test patterns that can use TST optimization. 
+ // (SELECT_CC setlt, sign_extend_inreg, 0, tval, fval) + // -> TST %operand, sign_bit; CSEL + // (SELECT_CC setlt, sign_extend, 0, tval, fval) + // -> TST %operand, sign_bit; CSEL + if (CC == ISD::SETLT && RHSC && RHSC->isZero() && LHS.hasOneUse() && + (LHS.getOpcode() == ISD::SIGN_EXTEND_INREG || + LHS.getOpcode() == ISD::SIGN_EXTEND)) { + + uint64_t SignBitPos; + std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS); + EVT TestVT = LHS.getValueType(); + SDValue SignBitConst = DAG.getConstant(1ULL << SignBitPos, DL, TestVT); + SDValue TST = + DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(TestVT, MVT::i32), + LHS, SignBitConst); + + SDValue Flags = TST.getValue(1); + return DAG.getNode(AArch64ISD::CSEL, DL, TVal.getValueType(), TVal, FVal, + DAG.getConstant(AArch64CC::NE, DL, MVT::i32), Flags); + } + // Canonicalise absolute difference patterns: // select_cc lhs, rhs, sub(lhs, rhs), sub(rhs, lhs), cc -> // select_cc lhs, rhs, sub(lhs, rhs), neg(sub(lhs, rhs)), cc diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 04b3c90..f788c75 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -9907,8 +9907,14 @@ def : Pat<(v4bf16 (bitconvert (v2f32 FPR64:$src))), def : Pat<(v4bf16 (bitconvert (v1f64 FPR64:$src))), (v4bf16 (REV64v4i16 FPR64:$src))>; } -def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>; -def : Pat<(v4bf16 (bitconvert (v4i16 FPR64:$src))), (v4bf16 FPR64:$src)>; +def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))), + (v4f16 FPR64:$src)>; +def : Pat<(v4f16 (bitconvert (v4bf16 FPR64:$src))), + (v4f16 FPR64:$src)>; +def : Pat<(v4bf16 (bitconvert (v4i16 FPR64:$src))), + (v4bf16 FPR64:$src)>; +def : Pat<(v4bf16 (bitconvert (v4f16 FPR64:$src))), + (v4bf16 FPR64:$src)>; let Predicates = [IsLE] in { def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), (v8i8 FPR64:$src)>; @@ -10236,8 +10242,14 @@ def : Pat<(v8bf16 (bitconvert (v2f64 FPR128:$src))), def : Pat<(v8bf16 (bitconvert (v4f32 FPR128:$src))), (v8bf16 (REV32v8i16 FPR128:$src))>; } -def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>; -def : Pat<(v8bf16 (bitconvert (v8i16 FPR128:$src))), (v8bf16 FPR128:$src)>; +def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))), + (v8f16 FPR128:$src)>; +def : Pat<(v8bf16 (bitconvert (v8i16 FPR128:$src))), + (v8bf16 FPR128:$src)>; +def : Pat<(v8f16 (bitconvert (v8bf16 FPR128:$src))), + (v8f16 FPR128:$src)>; +def : Pat<(v8bf16 (bitconvert (v8f16 FPR128:$src))), + (v8bf16 FPR128:$src)>; let Predicates = [IsLE] in { def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 FPR128:$src)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp index 38718c4..7504f1a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp @@ -150,7 +150,10 @@ public: if (!CVisited.insert(CII).second) continue; - if (CII->getParent() == II->getParent() && !IsLookThru(II)) + // Same-BB filter must look at the *user*; and allow non-lookthrough + // users when the def is a PHI (loop-header pattern). 
+ if (CII->getParent() == II->getParent() && !IsLookThru(CII) && + !isa<PHINode>(II)) continue; if (isOpLegal(CII)) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp index d9bfeae..0a59132 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp @@ -2562,7 +2562,9 @@ bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) { for (Function *F : NeedsPostProcess) Splitter.processFunction(*F); for (Function *F : Intrinsics) { - if (isRemovablePointerIntrinsic(F->getIntrinsicID())) { + // use_empty() can also occur with cases like masked load, which will + // have been rewritten out of the module by now but not erased. + if (F->use_empty() || isRemovablePointerIntrinsic(F->getIntrinsicID())) { F->eraseFromParent(); } else { std::optional<Function *> NewF = Intrinsic::remangleIntrinsicFunction(F); diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index ecd003c..098bcfa 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -9559,3 +9559,20 @@ bool LoongArchTargetLowering::shouldScalarizeBinop(SDValue VecOp) const { EVT ScalarVT = VecVT.getScalarType(); return isOperationLegalOrCustomOrPromote(Opc, ScalarVT); } + +bool LoongArchTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, + unsigned Index) const { + if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) + return false; + + // Extract a 128-bit subvector from index 0 of a 256-bit vector is free. + return Index == 0; +} + +bool LoongArchTargetLowering::isExtractVecEltCheap(EVT VT, + unsigned Index) const { + EVT EltVT = VT.getScalarType(); + + // Extract a scalar FP value from index 0 of a vector is free. + return (EltVT == MVT::f32 || EltVT == MVT::f64) && Index == 0; +} diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h index 3c00296..9b60a9f 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h @@ -338,6 +338,9 @@ public: unsigned Depth) const override; bool shouldScalarizeBinop(SDValue VecOp) const override; + bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, + unsigned Index) const override; + bool isExtractVecEltCheap(EVT VT, unsigned Index) const override; /// Check if a constant splat can be generated using [x]vldi, where imm[12] /// is 1. diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 563f3bb..d4124ae 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -167,6 +167,42 @@ static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) { return false; } +// If this is i64 AND is part of (X & -(1 << C1) & 0xffffffff) == C2 << C1), +// DAGCombiner can convert this to (sraiw X, C1) == sext(C2) for RV64. On RV32, +// the type will be split so only the lower 32 bits need to be compared using +// (srai/srli X, C) == C2. +static bool canUseShiftCmp(Instruction *Inst, const APInt &Imm) { + if (!Inst->hasOneUse()) + return false; + + // Look for equality comparison. + auto *Cmp = dyn_cast<ICmpInst>(*Inst->user_begin()); + if (!Cmp || !Cmp->isEquality()) + return false; + + // Right hand side of comparison should be a constant. 
+ auto *C = dyn_cast<ConstantInt>(Cmp->getOperand(1)); + if (!C) + return false; + + uint64_t Mask = Imm.getZExtValue(); + + // Mask should be of the form -(1 << C) in the lower 32 bits. + if (!isUInt<32>(Mask) || !isPowerOf2_32(-uint32_t(Mask))) + return false; + + // Comparison constant should be a subset of Mask. + uint64_t CmpC = C->getZExtValue(); + if ((CmpC & Mask) != CmpC) + return false; + + // We'll need to sign extend the comparison constant and shift it right. Make + // sure the new constant can use addi/xori+seqz/snez. + unsigned ShiftBits = llvm::countr_zero(Mask); + int64_t NewCmpC = SignExtend64<32>(CmpC) >> ShiftBits; + return NewCmpC >= -2048 && NewCmpC <= 2048; +} + InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, @@ -224,6 +260,9 @@ InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() && canUseShiftPair(Inst, Imm)) return TTI::TCC_Free; + if (Inst && Idx == 1 && Imm.getBitWidth() == 64 && + canUseShiftCmp(Inst, Imm)) + return TTI::TCC_Free; Takes12BitImm = true; break; case Instruction::Add: diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp index b4fc8da..db85e33 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp @@ -587,7 +587,8 @@ bool SPIRVLegalizerInfo::legalizeIsFPClass( } if (FPClassTest PartialCheck = Mask & fcNan) { - auto InfWithQnanBitC = buildSPIRVConstant(IntTy, Inf | QNaNBitMask); + auto InfWithQnanBitC = + buildSPIRVConstant(IntTy, std::move(Inf) | QNaNBitMask); if (PartialCheck == fcNan) { // isnan(V) ==> abs(V) u> int(inf) appendToRes( @@ -613,7 +614,7 @@ bool SPIRVLegalizerInfo::legalizeIsFPClass( APInt ExpLSB = ExpMask & ~(ExpMask.shl(1)); auto ExpMinusOne = assignSPIRVTy( MIRBuilder.buildSub(IntTy, Abs, buildSPIRVConstant(IntTy, ExpLSB))); - APInt MaxExpMinusOne = ExpMask - ExpLSB; + APInt MaxExpMinusOne = std::move(ExpMask) - ExpLSB; auto NormalRes = assignSPIRVTy( MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, ExpMinusOne, buildSPIRVConstant(IntTy, MaxExpMinusOne))); diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp index 33dc0a2..a1d4e0b 100644 --- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp +++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp @@ -277,6 +277,22 @@ bool X86FixupInstTuningPass::processInstruction( return true; }; + // Is ADD(X,X) more efficient than SHL(X,1)? 
+ auto ProcessShiftLeftToAdd = [&](unsigned AddOpc) -> bool { + if (MI.getOperand(NumOperands - 1).getImm() != 1) + return false; + if (!NewOpcPreferable(AddOpc, /*ReplaceInTie*/ true)) + return false; + LLVM_DEBUG(dbgs() << "Replacing: " << MI); + { + MI.setDesc(TII->get(AddOpc)); + MI.removeOperand(NumOperands - 1); + MI.addOperand(MI.getOperand(NumOperands - 2)); + } + LLVM_DEBUG(dbgs() << " With: " << MI); + return false; + }; + switch (Opc) { case X86::BLENDPDrri: return ProcessBLENDToMOV(X86::MOVSDrr, 0x3, 0x1); @@ -563,6 +579,44 @@ bool X86FixupInstTuningPass::processInstruction( return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rmkz); case X86::VUNPCKHPSZrmkz: return ProcessUNPCKPS(X86::VPUNPCKHDQZrmkz); + + case X86::PSLLWri: + return ProcessShiftLeftToAdd(X86::PADDWrr); + case X86::VPSLLWri: + return ProcessShiftLeftToAdd(X86::VPADDWrr); + case X86::VPSLLWYri: + return ProcessShiftLeftToAdd(X86::VPADDWYrr); + case X86::VPSLLWZ128ri: + return ProcessShiftLeftToAdd(X86::VPADDWZ128rr); + case X86::VPSLLWZ256ri: + return ProcessShiftLeftToAdd(X86::VPADDWZ256rr); + case X86::VPSLLWZri: + return ProcessShiftLeftToAdd(X86::VPADDWZrr); + case X86::PSLLDri: + return ProcessShiftLeftToAdd(X86::PADDDrr); + case X86::VPSLLDri: + return ProcessShiftLeftToAdd(X86::VPADDDrr); + case X86::VPSLLDYri: + return ProcessShiftLeftToAdd(X86::VPADDDYrr); + case X86::VPSLLDZ128ri: + return ProcessShiftLeftToAdd(X86::VPADDDZ128rr); + case X86::VPSLLDZ256ri: + return ProcessShiftLeftToAdd(X86::VPADDDZ256rr); + case X86::VPSLLDZri: + return ProcessShiftLeftToAdd(X86::VPADDDZrr); + case X86::PSLLQri: + return ProcessShiftLeftToAdd(X86::PADDQrr); + case X86::VPSLLQri: + return ProcessShiftLeftToAdd(X86::VPADDQrr); + case X86::VPSLLQYri: + return ProcessShiftLeftToAdd(X86::VPADDQYrr); + case X86::VPSLLQZ128ri: + return ProcessShiftLeftToAdd(X86::VPADDQZ128rr); + case X86::VPSLLQZ256ri: + return ProcessShiftLeftToAdd(X86::VPADDQZ256rr); + case X86::VPSLLQZri: + return ProcessShiftLeftToAdd(X86::VPADDQZrr); + default: return false; } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index efeddd7..cd04ff5 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -4456,8 +4456,8 @@ SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget, bool AllowAVX512 = true) { assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2"); unsigned NumSubs = 1; - if ((CheckBWI && Subtarget.useBWIRegs()) || - (!CheckBWI && AllowAVX512 && Subtarget.useAVX512Regs())) { + if (AllowAVX512 && ((CheckBWI && Subtarget.useBWIRegs()) || + (!CheckBWI && Subtarget.useAVX512Regs()))) { if (VT.getSizeInBits() > 512) { NumSubs = VT.getSizeInBits() / 512; assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size"); @@ -30313,22 +30313,8 @@ static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG, uint64_t ShiftAmt = APIntShiftAmt.getZExtValue(); - if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) { - // Hardware support for vector shifts is sparse which makes us scalarize the - // vector operations in many cases. Also, on sandybridge ADD is faster than - // shl: (shl V, 1) -> (add (freeze V), (freeze V)) - if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) { - // R may be undef at run-time, but (shl R, 1) must be an even number (LSB - // must be 0). (add undef, undef) however can be any value. 
To make this - // safe, we must freeze R to ensure that register allocation uses the same - // register for an undefined value. This ensures that the result will - // still be even and preserves the original semantics. - R = DAG.getFreeze(R); - return DAG.getNode(ISD::ADD, dl, VT, R, R); - } - + if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG); - } // i64 SRA needs to be performed as partial shifts. if (((!Subtarget.hasXOP() && VT == MVT::v2i64) || @@ -31229,16 +31215,16 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, unsigned NumElts = VT.getVectorNumElements(); if (Subtarget.hasVBMI2() && EltSizeInBits > 8) { - if (IsFSHR) - std::swap(Op0, Op1); if (IsCstSplat) { + if (IsFSHR) + std::swap(Op0, Op1); uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits); SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8); return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT, {Op0, Op1, Imm}, DAG, Subtarget); } - return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT, + return getAVX512Node(IsFSHR ? ISD::FSHR : ISD::FSHL, DL, VT, {Op0, Op1, Amt}, DAG, Subtarget); } assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 || @@ -35153,8 +35139,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(VALIGN) NODE_NAME_CASE(VSHLD) NODE_NAME_CASE(VSHRD) - NODE_NAME_CASE(VSHLDV) - NODE_NAME_CASE(VSHRDV) NODE_NAME_CASE(PSHUFD) NODE_NAME_CASE(PSHUFHW) NODE_NAME_CASE(PSHUFLW) @@ -45185,6 +45169,7 @@ bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode( case X86ISD::Wrapper: case X86ISD::WrapperRIP: return true; + case X86ISD::INSERTPS: case X86ISD::BLENDI: case X86ISD::PSHUFB: case X86ISD::PSHUFD: @@ -45255,6 +45240,7 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode( case X86ISD::BLENDV: return false; // SSE target shuffles. + case X86ISD::INSERTPS: case X86ISD::PSHUFB: case X86ISD::PSHUFD: case X86ISD::UNPCKL: @@ -46211,7 +46197,7 @@ static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS, SDValue Zero = DAG.getConstant(0, DL, DpVT); return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1}, - DpBuilder, false); + DpBuilder, /*CheckBWI=*/false, Subtarget.hasVNNI()); } // Create a PSADBW given two sources representable as zexts of vXi8. diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 8ab8c66..b55556a 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -471,8 +471,7 @@ namespace llvm { // VBMI2 Concat & Shift. VSHLD, VSHRD, - VSHLDV, - VSHRDV, + // Shuffle Packed Values at 128-bit granularity. 
SHUF128, MOVDDUP, diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 2371ed4..564810c 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -12300,72 +12300,76 @@ defm : vpclmulqdq_aliases<"VPCLMULQDQZ256", VR256X, i256mem>; // VBMI2 //===----------------------------------------------------------------------===// -multiclass VBMI2_shift_var_rm<bits<8> Op, string OpStr, SDNode OpNode, +multiclass VBMI2_shift_var_rm<bits<8> Op, string OpStr, SDNode OpNode, bit SwapLR, X86FoldableSchedWrite sched, X86VectorVTInfo VTI> { let Constraints = "$src1 = $dst", ExeDomain = VTI.ExeDomain in { defm r: AVX512_maskable_3src<Op, MRMSrcReg, VTI, (outs VTI.RC:$dst), (ins VTI.RC:$src2, VTI.RC:$src3), OpStr, "$src3, $src2", "$src2, $src3", - (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2, VTI.RC:$src3))>, + !if(SwapLR, + (VTI.VT (OpNode (VTI.VT VTI.RC:$src2), (VTI.VT VTI.RC:$src1), (VTI.VT VTI.RC:$src3))), + (VTI.VT (OpNode (VTI.VT VTI.RC:$src1), (VTI.VT VTI.RC:$src2), (VTI.VT VTI.RC:$src3))))>, T8, PD, EVEX, VVVV, Sched<[sched]>; defm m: AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst), (ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr, "$src3, $src2", "$src2, $src3", - (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2, - (VTI.VT (VTI.LdFrag addr:$src3))))>, + !if(SwapLR, + (VTI.VT (OpNode (VTI.VT VTI.RC:$src2), (VTI.VT VTI.RC:$src1), (VTI.VT (VTI.LdFrag addr:$src3)))), + (VTI.VT (OpNode (VTI.VT VTI.RC:$src1), (VTI.VT VTI.RC:$src2), (VTI.VT (VTI.LdFrag addr:$src3)))))>, T8, PD, EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>; } } -multiclass VBMI2_shift_var_rmb<bits<8> Op, string OpStr, SDNode OpNode, +multiclass VBMI2_shift_var_rmb<bits<8> Op, string OpStr, SDNode OpNode, bit SwapLR, X86FoldableSchedWrite sched, X86VectorVTInfo VTI> - : VBMI2_shift_var_rm<Op, OpStr, OpNode, sched, VTI> { + : VBMI2_shift_var_rm<Op, OpStr, OpNode, SwapLR, sched, VTI> { let Constraints = "$src1 = $dst", ExeDomain = VTI.ExeDomain in defm mb: AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst), (ins VTI.RC:$src2, VTI.ScalarMemOp:$src3), OpStr, "${src3}"#VTI.BroadcastStr#", $src2", "$src2, ${src3}"#VTI.BroadcastStr, - (OpNode VTI.RC:$src1, VTI.RC:$src2, - (VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>, + !if(SwapLR, + (OpNode (VTI.VT VTI.RC:$src2), (VTI.VT VTI.RC:$src1), (VTI.VT (VTI.BroadcastLdFrag addr:$src3))), + (OpNode (VTI.VT VTI.RC:$src1), (VTI.VT VTI.RC:$src2), (VTI.VT (VTI.BroadcastLdFrag addr:$src3))))>, T8, PD, EVEX, VVVV, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } -multiclass VBMI2_shift_var_rm_common<bits<8> Op, string OpStr, SDNode OpNode, +multiclass VBMI2_shift_var_rm_common<bits<8> Op, string OpStr, SDNode OpNode, bit SwapLR, X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTI> { let Predicates = [HasVBMI2] in - defm Z : VBMI2_shift_var_rm<Op, OpStr, OpNode, sched.ZMM, VTI.info512>, + defm Z : VBMI2_shift_var_rm<Op, OpStr, OpNode, SwapLR, sched.ZMM, VTI.info512>, EVEX_V512; let Predicates = [HasVBMI2, HasVLX] in { - defm Z256 : VBMI2_shift_var_rm<Op, OpStr, OpNode, sched.YMM, VTI.info256>, + defm Z256 : VBMI2_shift_var_rm<Op, OpStr, OpNode, SwapLR, sched.YMM, VTI.info256>, EVEX_V256; - defm Z128 : VBMI2_shift_var_rm<Op, OpStr, OpNode, sched.XMM, VTI.info128>, + defm Z128 : VBMI2_shift_var_rm<Op, OpStr, OpNode, SwapLR, sched.XMM, VTI.info128>, EVEX_V128; } } -multiclass VBMI2_shift_var_rmb_common<bits<8> Op, string OpStr, SDNode OpNode, +multiclass VBMI2_shift_var_rmb_common<bits<8> Op, string OpStr, SDNode 
OpNode, bit SwapLR, X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTI> { let Predicates = [HasVBMI2] in - defm Z : VBMI2_shift_var_rmb<Op, OpStr, OpNode, sched.ZMM, VTI.info512>, + defm Z : VBMI2_shift_var_rmb<Op, OpStr, OpNode, SwapLR, sched.ZMM, VTI.info512>, EVEX_V512; let Predicates = [HasVBMI2, HasVLX] in { - defm Z256 : VBMI2_shift_var_rmb<Op, OpStr, OpNode, sched.YMM, VTI.info256>, + defm Z256 : VBMI2_shift_var_rmb<Op, OpStr, OpNode, SwapLR, sched.YMM, VTI.info256>, EVEX_V256; - defm Z128 : VBMI2_shift_var_rmb<Op, OpStr, OpNode, sched.XMM, VTI.info128>, + defm Z128 : VBMI2_shift_var_rmb<Op, OpStr, OpNode, SwapLR, sched.XMM, VTI.info128>, EVEX_V128; } } multiclass VBMI2_shift_var<bits<8> wOp, bits<8> dqOp, string Prefix, - SDNode OpNode, X86SchedWriteWidths sched> { - defm W : VBMI2_shift_var_rm_common<wOp, Prefix#"w", OpNode, sched, + SDNode OpNode, bit SwapLR, X86SchedWriteWidths sched> { + defm W : VBMI2_shift_var_rm_common<wOp, Prefix#"w", OpNode, SwapLR, sched, avx512vl_i16_info>, REX_W, EVEX_CD8<16, CD8VF>; - defm D : VBMI2_shift_var_rmb_common<dqOp, Prefix#"d", OpNode, sched, + defm D : VBMI2_shift_var_rmb_common<dqOp, Prefix#"d", OpNode, SwapLR, sched, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; - defm Q : VBMI2_shift_var_rmb_common<dqOp, Prefix#"q", OpNode, sched, + defm Q : VBMI2_shift_var_rmb_common<dqOp, Prefix#"q", OpNode, SwapLR, sched, avx512vl_i64_info>, REX_W, EVEX_CD8<64, CD8VF>; } @@ -12381,8 +12385,8 @@ multiclass VBMI2_shift_imm<bits<8> wOp, bits<8> dqOp, string Prefix, } // Concat & Shift -defm VPSHLDV : VBMI2_shift_var<0x70, 0x71, "vpshldv", X86VShldv, SchedWriteVecIMul>; -defm VPSHRDV : VBMI2_shift_var<0x72, 0x73, "vpshrdv", X86VShrdv, SchedWriteVecIMul>; +defm VPSHLDV : VBMI2_shift_var<0x70, 0x71, "vpshldv", fshl, 0, SchedWriteVecIMul>; +defm VPSHRDV : VBMI2_shift_var<0x72, 0x73, "vpshrdv", fshr, 1, SchedWriteVecIMul>; defm VPSHLD : VBMI2_shift_imm<0x70, 0x71, "vpshld", X86VShld, SchedWriteVecIMul>; defm VPSHRD : VBMI2_shift_imm<0x72, 0x73, "vpshrd", X86VShrd, SchedWriteVecIMul>; diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td index b4768590..031fdc1 100644 --- a/llvm/lib/Target/X86/X86InstrArithmetic.td +++ b/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -25,18 +25,12 @@ let SchedRW = [WriteLEA] in { [(set GR32:$dst, lea32addr:$src)]>, OpSize32, Requires<[Not64BitMode]>; - let Predicates = [HasNDD], isCodeGenOnly = 1 in { - def LEA64_8r : I<0x8D, MRMSrcMem, (outs GR8:$dst), (ins lea64_8mem:$src), - "lea{b}\t{$src|$dst}, {$dst|$src}", - [(set GR8:$dst, lea64_iaddr:$src)]>, - OpSize16, - Requires<[In64BitMode]>; - - def LEA64_16r : I<0x8D, MRMSrcMem, (outs GR16:$dst), (ins lea64_16mem:$src), - "lea{w}\t{$src|$dst}, {$dst|$src}", - [(set GR16:$dst, lea64_iaddr:$src)]>, - OpSize16, - Requires<[In64BitMode]>; + let isCodeGenOnly = 1 in { + def LEA64_8r : I<0x8D, MRMSrcMem, (outs GR32:$dst), (ins lea64_8mem:$src), + "lea{l}\t{$src|$dst}, {$dst|$src}", []>, OpSize32; + + def LEA64_16r : I<0x8D, MRMSrcMem, (outs GR32:$dst), (ins lea64_16mem:$src), + "lea{l}\t{$src|$dst}, {$dst|$src}", []>, OpSize32; } def LEA64_32r : I<0x8D, MRMSrcMem, (outs GR32:$dst), (ins lea64_32mem:$src), @@ -51,6 +45,11 @@ let SchedRW = [WriteLEA] in { [(set GR64:$dst, lea64addr:$src)]>; } // SchedRW +let Predicates = [HasNDD] in { + def : Pat<(i8 lea64_iaddr:$src), (EXTRACT_SUBREG (LEA64_8r lea64_8mem:$src), sub_8bit)>; + def : Pat<(i16 lea64_iaddr:$src), (EXTRACT_SUBREG (LEA64_16r lea64_16mem:$src), sub_16bit)>; +} + // Pseudo instruction for 
lea that prevent optimizer from eliminating // the instruction. let SchedRW = [WriteLEA], isPseudo = true, hasSideEffects = 1 in { diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index 0c20ffe..5321ecf 100644 --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -406,16 +406,6 @@ def X86VAlign : SDNode<"X86ISD::VALIGN", SDTShuff3OpI>; def X86VShld : SDNode<"X86ISD::VSHLD", SDTShuff3OpI>; def X86VShrd : SDNode<"X86ISD::VSHRD", SDTShuff3OpI>; -def X86VShldv : SDNode<"X86ISD::VSHLDV", - SDTypeProfile<1, 3, [SDTCisVec<0>, - SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>, - SDTCisSameAs<0,3>]>>; -def X86VShrdv : SDNode<"X86ISD::VSHRDV", - SDTypeProfile<1, 3, [SDTCisVec<0>, - SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>, - SDTCisSameAs<0,3>]>>; def X86Conflict : SDNode<"X86ISD::CONFLICT", SDTIntUnaryOp>; |
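
The AArch64 SELECT_CC change above folds a signed-less-than-zero test of a sign-extended value into an ANDS of the sign bit followed by CSEL on the NE flag. A minimal standalone sketch of the scalar identity it relies on (plain C++, not the LLVM lowering itself; function names are illustrative):

#include <cassert>
#include <cstdint>

// (SELECT_CC setlt, sign_extend(x), 0, tval, fval) as plain source code.
int selectViaCompare(int8_t x, int tval, int fval) {
  return (int32_t)x < 0 ? tval : fval;            // sign_extend + compare with 0
}

// Equivalent form: test the sign bit of the un-extended value (TST), then select (CSEL).
int selectViaSignBitTest(int8_t x, int tval, int fval) {
  return ((uint8_t)x & 0x80u) != 0 ? tval : fval; // ANDS with (1 << sign_bit_pos), CSEL on NE
}

int main() {
  for (int v = -128; v <= 127; ++v)
    assert(selectViaCompare((int8_t)v, 1, 2) == selectViaSignBitTest((int8_t)v, 1, 2));
}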
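
The new RISC-V canUseShiftCmp hook marks certain 64-bit AND immediates as free because the equality comparison can instead be done on a 32-bit arithmetic-shifted value (sraiw on RV64), as the comment in the patch describes. A standalone sketch of the equivalence it depends on, assuming the mask and constant shapes the hook checks (not LLVM code):

#include <cassert>
#include <cstdint>
#include <random>

// Mask = -(1 << c1) truncated to 32 bits, cmpc a subset of Mask with its low c1 bits clear.
bool viaMask(uint64_t x, unsigned c1, uint32_t cmpc) {
  uint64_t mask = uint32_t(-(1u << c1));          // bits c1..31 set
  return (x & mask) == cmpc;
}

bool viaShift(uint64_t x, unsigned c1, uint32_t cmpc) {
  int64_t lhs = int64_t(int32_t(uint32_t(x))) >> c1; // sraiw x, c1
  int64_t rhs = int64_t(int32_t(cmpc)) >> c1;        // sext(CmpC) >> c1
  // The patch additionally requires rhs to fit an addi/xori immediate; that is a
  // cost concern only and does not affect the equivalence shown here.
  return lhs == rhs;
}

int main() {
  std::mt19937_64 rng(0);
  for (int i = 0; i < 1000000; ++i) {
    uint64_t x = rng();
    unsigned c1 = rng() % 32;
    uint64_t mask = uint32_t(-(1u << c1));
    uint32_t cmpc = uint32_t(rng()) & uint32_t(mask); // subset of the mask
    assert(viaMask(x, c1, cmpc) == viaShift(x, c1, cmpc));
  }
}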
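
The X86FixupInstTuning addition rewrites a vector shift-left-by-one (PSLLW/PSLLD/PSLLQ and the VEX/EVEX forms) into an add of the register with itself, gated by the scheduling-model check in NewOpcPreferable. The per-lane identity it uses, as a trivial sketch:

#include <cassert>
#include <cstdint>

int main() {
  // x << 1 == x + x for every lane value, including wrap-around.
  for (uint32_t i = 0; i < (1u << 16); ++i) {
    uint16_t lane = uint16_t(i);
    assert(uint16_t(lane << 1) == uint16_t(lane + lane));
  }
}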
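
The VBMI2 table changes select VPSHLDV/VPSHRDV directly from the generic fshl/fshr nodes, with the new SwapLR bit handling the operand order of the shift-right form. A rough sketch of funnel-shift semantics on a single 16-bit lane (plain C++, assuming the modulo-element-width shift amount of ISD::FSHL/FSHR):

#include <cassert>
#include <cstdint>

// fshl(hi, lo, a): concatenate hi:lo, shift left by a (mod 16), keep the high half.
uint16_t fshl16(uint16_t hi, uint16_t lo, unsigned amt) {
  amt %= 16;
  return amt ? uint16_t((hi << amt) | (lo >> (16 - amt))) : hi;
}

// fshr(hi, lo, a): concatenate hi:lo, shift right by a (mod 16), keep the low half.
uint16_t fshr16(uint16_t hi, uint16_t lo, unsigned amt) {
  amt %= 16;
  return amt ? uint16_t((lo >> amt) | (hi << (16 - amt))) : lo;
}

int main() {
  // For a nonzero amount, fshr picks the same window as fshl with the complementary amount.
  for (unsigned a = 1; a < 16; ++a)
    assert(fshr16(0xABCD, 0x1234, a) == fshl16(0xABCD, 0x1234, 16 - a));
}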