Diffstat (limited to 'llvm/lib/Target')
135 files changed, 1915 insertions, 1015 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp index e8d3161..ad8368e 100644 --- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp @@ -597,6 +597,14 @@ Function *AArch64Arm64ECCallLowering::buildEntryThunk(Function *F) { return Thunk; } +std::optional<std::string> getArm64ECMangledFunctionName(GlobalValue &GV) { + if (!GV.hasName()) { + GV.setName("__unnamed"); + } + + return llvm::getArm64ECMangledFunctionName(GV.getName()); +} + // Builds the "guest exit thunk", a helper to call a function which may or may // not be an exit thunk. (We optimistically assume non-dllimport function // declarations refer to functions defined in AArch64 code; if the linker @@ -608,7 +616,7 @@ Function *AArch64Arm64ECCallLowering::buildGuestExitThunk(Function *F) { getThunkType(F->getFunctionType(), F->getAttributes(), Arm64ECThunkType::GuestExit, NullThunkName, Arm64Ty, X64Ty, ArgTranslations); - auto MangledName = getArm64ECMangledFunctionName(F->getName().str()); + auto MangledName = getArm64ECMangledFunctionName(*F); assert(MangledName && "Can't guest exit to function that's already native"); std::string ThunkName = *MangledName; if (ThunkName[0] == '?' && ThunkName.find("@") != std::string::npos) { @@ -727,9 +735,6 @@ AArch64Arm64ECCallLowering::buildPatchableThunk(GlobalAlias *UnmangledAlias, // Lower an indirect call with inline code. void AArch64Arm64ECCallLowering::lowerCall(CallBase *CB) { - assert(CB->getModule()->getTargetTriple().isOSWindows() && - "Only applicable for Windows targets"); - IRBuilder<> B(CB); Value *CalledOperand = CB->getCalledOperand(); @@ -790,7 +795,7 @@ bool AArch64Arm64ECCallLowering::runOnModule(Module &Mod) { if (!F) continue; if (std::optional<std::string> MangledName = - getArm64ECMangledFunctionName(A.getName().str())) { + getArm64ECMangledFunctionName(A)) { F->addMetadata("arm64ec_unmangled_name", *MDNode::get(M->getContext(), MDString::get(M->getContext(), A.getName()))); @@ -807,7 +812,7 @@ bool AArch64Arm64ECCallLowering::runOnModule(Module &Mod) { cast<GlobalValue>(F.getPersonalityFn()->stripPointerCasts()); if (PersFn->getValueType() && PersFn->getValueType()->isFunctionTy()) { if (std::optional<std::string> MangledName = - getArm64ECMangledFunctionName(PersFn->getName().str())) { + getArm64ECMangledFunctionName(*PersFn)) { PersFn->setName(MangledName.value()); } } @@ -821,7 +826,7 @@ bool AArch64Arm64ECCallLowering::runOnModule(Module &Mod) { // Rename hybrid patchable functions and change callers to use a global // alias instead. if (std::optional<std::string> MangledName = - getArm64ECMangledFunctionName(F.getName().str())) { + getArm64ECMangledFunctionName(F)) { std::string OrigName(F.getName()); F.setName(MangledName.value() + HybridPatchableTargetSuffix); @@ -927,7 +932,7 @@ bool AArch64Arm64ECCallLowering::processFunction( // FIXME: Handle functions with weak linkage? 
if (!F.hasLocalLinkage() || F.hasAddressTaken()) { if (std::optional<std::string> MangledName = - getArm64ECMangledFunctionName(F.getName().str())) { + getArm64ECMangledFunctionName(F)) { F.addMetadata("arm64ec_unmangled_name", *MDNode::get(M->getContext(), MDString::get(M->getContext(), F.getName()))); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 2b6ea86..018c16d 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -28609,14 +28609,16 @@ Value *AArch64TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const { void AArch64TargetLowering::insertSSPDeclarations(Module &M) const { // MSVC CRT provides functionalities for stack protection. - if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) { + RTLIB::LibcallImpl SecurityCheckCookieLibcall = + getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE); + if (SecurityCheckCookieLibcall != RTLIB::Unsupported) { // MSVC CRT has a global variable holding security cookie. M.getOrInsertGlobal("__security_cookie", PointerType::getUnqual(M.getContext())); // MSVC CRT has a function to validate security cookie. FunctionCallee SecurityCheckCookie = - M.getOrInsertFunction(Subtarget->getSecurityCheckCookieName(), + M.getOrInsertFunction(getLibcallImplName(SecurityCheckCookieLibcall), Type::getVoidTy(M.getContext()), PointerType::getUnqual(M.getContext())); if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) { @@ -28637,8 +28639,10 @@ Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const { Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const { // MSVC CRT has a function to validate security cookie. - if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) - return M.getFunction(Subtarget->getSecurityCheckCookieName()); + RTLIB::LibcallImpl SecurityCheckCookieLibcall = + getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE); + if (SecurityCheckCookieLibcall != RTLIB::Unsupported) + return M.getFunction(getLibcallImplName(SecurityCheckCookieLibcall)); return TargetLowering::getSSPStackGuardCheck(M); } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index ea63edd8..8887657 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -887,6 +887,10 @@ private: bool shouldScalarizeBinop(SDValue VecOp) const override { return VecOp.getOpcode() == ISD::SETCC; } + + bool hasMultipleConditionRegisters(EVT VT) const override { + return VT.isScalableVector(); + } }; namespace AArch64 { diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index ba7cbcc..5a537f2 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -6484,7 +6484,9 @@ class BaseSIMDThreeSameVectorDot<bit Q, bit U, bits<2> sz, bits<4> opc, string a (OpNode (AccumType RegType:$Rd), (InputType RegType:$Rn), (InputType RegType:$Rm)))]> { - let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # ", $Rm" # kind2 # "}"); + + let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # ", $Rm" # kind2 # + "|" # kind1 # "\t$Rd, $Rn, $Rm}"); } multiclass SIMDThreeSameVectorDot<bit U, bit Mixed, string asm, SDPatternOperator OpNode> { @@ -6507,7 +6509,8 @@ class BaseSIMDThreeSameVectorFML<bit Q, bit U, bit b13, bits<3> size, string asm (OpNode (AccumType RegType:$Rd), (InputType 
RegType:$Rn), (InputType RegType:$Rm)))]> { - let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # ", $Rm" # kind2 # "}"); + let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # ", $Rm" # kind2 # + "|" # kind1 # "\t$Rd, $Rn, $Rm}"); let Inst{13} = b13; } @@ -8986,7 +8989,8 @@ class BaseSIMDThreeSameVectorBFDot<bit Q, bit U, string asm, string kind1, (InputType RegType:$Rm)))]> { let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # - ", $Rm" # kind2 # "}"); + ", $Rm" # kind2 # + "|" # kind1 # "\t$Rd, $Rn, $Rm}"); } multiclass SIMDThreeSameVectorBFDot<bit U, string asm> { @@ -9032,7 +9036,7 @@ class SIMDBF16MLAL<bit Q, string asm, SDPatternOperator OpNode> [(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd), (v8bf16 V128:$Rn), (v8bf16 V128:$Rm)))]> { - let AsmString = !strconcat(asm, "{\t$Rd.4s, $Rn.8h, $Rm.8h}"); + let AsmString = !strconcat(asm, "{\t$Rd.4s, $Rn.8h, $Rm.8h|.4s\t$Rd, $Rn, $Rm}"); } let mayRaiseFPException = 1, Uses = [FPCR] in @@ -9071,8 +9075,7 @@ class SIMDThreeSameVectorBF16MatrixMul<string asm> (int_aarch64_neon_bfmmla (v4f32 V128:$Rd), (v8bf16 V128:$Rn), (v8bf16 V128:$Rm)))]> { - let AsmString = !strconcat(asm, "{\t$Rd", ".4s", ", $Rn", ".8h", - ", $Rm", ".8h", "}"); + let AsmString = !strconcat(asm, "{\t$Rd.4s, $Rn.8h, $Rm.8h|.4s\t$Rd, $Rn, $Rm}"); } let mayRaiseFPException = 1, Uses = [FPCR] in @@ -9143,7 +9146,7 @@ class SIMDThreeSameVectorMatMul<bit B, bit U, string asm, SDPatternOperator OpNo [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]> { - let AsmString = asm # "{\t$Rd.4s, $Rn.16b, $Rm.16b}"; + let AsmString = asm # "{\t$Rd.4s, $Rn.16b, $Rm.16b|.4s\t$Rd, $Rn, $Rm}"; } //---------------------------------------------------------------------------- @@ -13344,8 +13347,8 @@ multiclass AtomicFPStore<bit R, bits<3> op0, string asm> { class BaseSIMDThreeSameVectorFP8MatrixMul<string asm, bits<2> size, string kind> : BaseSIMDThreeSameVectorTied<1, 1, {size, 0}, 0b11101, V128, asm, ".16b", []> { - let AsmString = !strconcat(asm, "{\t$Rd", kind, ", $Rn", ".16b", - ", $Rm", ".16b", "}"); + let AsmString = !strconcat(asm, "{\t$Rd", kind, ", $Rn.16b, $Rm.16b", + "|", kind, "\t$Rd, $Rn, $Rm}"); } multiclass SIMDThreeSameVectorFP8MatrixMul<string asm>{ diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index 061ed61..d00e447 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -451,12 +451,6 @@ public: return "__chkstk"; } - const char* getSecurityCheckCookieName() const { - if (isWindowsArm64EC()) - return "#__security_check_cookie_arm64ec"; - return "__security_check_cookie"; - } - /// Choose a method of checking LR before performing a tail call. AArch64PAuth::AuthCheckMethod getAuthenticatedLRCheckMethod(const MachineFunction &MF) const; diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index e1adc0b..9f05add 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -3092,6 +3092,13 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, return AdjustCost( BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); + // For the moment we do not have lowering for SVE1-only fptrunc f64->bf16 as + // we use fcvtx under SVE2. Give them invalid costs. 
+ if (!ST->hasSVE2() && !ST->isStreamingSVEAvailable() && + ISD == ISD::FP_ROUND && SrcTy.isScalableVector() && + DstTy.getScalarType() == MVT::bf16 && SrcTy.getScalarType() == MVT::f64) + return InstructionCost::getInvalid(); + static const TypeConversionCostTblEntry BF16Tbl[] = { {ISD::FP_ROUND, MVT::bf16, MVT::f32, 1}, // bfcvt {ISD::FP_ROUND, MVT::bf16, MVT::f64, 1}, // bfcvt @@ -3100,6 +3107,12 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 2}, // bfcvtn+fcvtn {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 3}, // fcvtn+fcvtl2+bfcvtn {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+bfcvtn + {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 1}, // bfcvt + {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 1}, // bfcvt + {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 3}, // bfcvt+bfcvt+uzp1 + {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 2}, // fcvtx+bfcvt + {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 5}, // 2*fcvtx+2*bfcvt+uzp1 + {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 11}, // 4*fcvt+4*bfcvt+3*uzp }; if (ST->hasBF16()) @@ -3508,11 +3521,21 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1}, {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3}, + // Truncate from nxvmf32 to nxvmbf16. + {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 8}, + {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 8}, + {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 17}, + // Truncate from nxvmf64 to nxvmf16. {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1}, {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3}, {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7}, + // Truncate from nxvmf64 to nxvmbf16. + {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 9}, + {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 19}, + {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 39}, + // Truncate from nxvmf64 to nxvmf32. {ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1}, {ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3}, @@ -3523,11 +3546,21 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1}, {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2}, + // Extend from nxvmbf16 to nxvmf32. + {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2bf16, 1}, // lsl + {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4bf16, 1}, // lsl + {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8bf16, 4}, // unpck+unpck+lsl+lsl + // Extend from nxvmf16 to nxvmf64. {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1}, {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2}, {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4}, + // Extend from nxvmbf16 to nxvmf64. + {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2bf16, 2}, // lsl+fcvt + {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4bf16, 6}, // 2*unpck+2*lsl+2*fcvt + {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8bf16, 14}, // 6*unpck+4*lsl+4*fcvt + // Extend from nxvmf32 to nxvmf64. {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1}, {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2}, @@ -4282,10 +4315,9 @@ InstructionCost AArch64TTIImpl::getCmpSelInstrCost( unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, const Instruction *I) const { - int ISD = TLI->InstructionOpcodeToISD(Opcode); // We don't lower some vector selects well that are wider than the register // width. TODO: Improve this with different cost kinds. 
- if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) { + if (isa<FixedVectorType>(ValTy) && Opcode == Instruction::Select) { // We would need this many instructions to hide the scalarization happening. const int AmortizationCost = 20; @@ -4315,55 +4347,72 @@ InstructionCost AArch64TTIImpl::getCmpSelInstrCost( return LT.first; } - static const TypeConversionCostTblEntry - VectorSelectTbl[] = { - { ISD::SELECT, MVT::v2i1, MVT::v2f32, 2 }, - { ISD::SELECT, MVT::v2i1, MVT::v2f64, 2 }, - { ISD::SELECT, MVT::v4i1, MVT::v4f32, 2 }, - { ISD::SELECT, MVT::v4i1, MVT::v4f16, 2 }, - { ISD::SELECT, MVT::v8i1, MVT::v8f16, 2 }, - { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 }, - { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 }, - { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 }, - { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost }, - { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost }, - { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost } - }; + static const TypeConversionCostTblEntry VectorSelectTbl[] = { + {Instruction::Select, MVT::v2i1, MVT::v2f32, 2}, + {Instruction::Select, MVT::v2i1, MVT::v2f64, 2}, + {Instruction::Select, MVT::v4i1, MVT::v4f32, 2}, + {Instruction::Select, MVT::v4i1, MVT::v4f16, 2}, + {Instruction::Select, MVT::v8i1, MVT::v8f16, 2}, + {Instruction::Select, MVT::v16i1, MVT::v16i16, 16}, + {Instruction::Select, MVT::v8i1, MVT::v8i32, 8}, + {Instruction::Select, MVT::v16i1, MVT::v16i32, 16}, + {Instruction::Select, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost}, + {Instruction::Select, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost}, + {Instruction::Select, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost}}; EVT SelCondTy = TLI->getValueType(DL, CondTy); EVT SelValTy = TLI->getValueType(DL, ValTy); if (SelCondTy.isSimple() && SelValTy.isSimple()) { - if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD, + if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, Opcode, SelCondTy.getSimpleVT(), SelValTy.getSimpleVT())) return Entry->Cost; } } - if (isa<FixedVectorType>(ValTy) && ISD == ISD::SETCC) { - Type *ValScalarTy = ValTy->getScalarType(); - if ((ValScalarTy->isHalfTy() && !ST->hasFullFP16()) || - ValScalarTy->isBFloatTy()) { - auto *ValVTy = cast<FixedVectorType>(ValTy); - - // Without dedicated instructions we promote [b]f16 compares to f32. - auto *PromotedTy = - VectorType::get(Type::getFloatTy(ValTy->getContext()), ValVTy); - - InstructionCost Cost = 0; - // Promote operands to float vectors. - Cost += 2 * getCastInstrCost(Instruction::FPExt, PromotedTy, ValTy, - TTI::CastContextHint::None, CostKind); - // Compare float vectors. + if (Opcode == Instruction::FCmp) { + // Without dedicated instructions we promote f16 + bf16 compares to f32. + if ((!ST->hasFullFP16() && ValTy->getScalarType()->isHalfTy()) || + ValTy->getScalarType()->isBFloatTy()) { + Type *PromotedTy = + ValTy->getWithNewType(Type::getFloatTy(ValTy->getContext())); + InstructionCost Cost = + getCastInstrCost(Instruction::FPExt, PromotedTy, ValTy, + TTI::CastContextHint::None, CostKind); + if (!Op1Info.isConstant() && !Op2Info.isConstant()) + Cost *= 2; Cost += getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred, CostKind, Op1Info, Op2Info); - // During codegen we'll truncate the vector result from i32 to i16. 
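An editorial sketch, not part of the patch: the promoted-compare cost introduced above for f16 (without full FP16) and bf16 is the sum of the operand extensions, the f32 compare, and, for vectors, the truncate of the mask back to the narrow element width. In plain C++, with the per-piece costs assumed as inputs and the helper name hypothetical:

unsigned promotedFCmpCost(unsigned FpExtCost, unsigned F32CmpCost,
                          unsigned TruncCost, bool BothOperandsVariable,
                          bool IsVector) {
  // One fpext per non-constant operand; a constant operand folds to f32.
  unsigned Cost = BothOperandsVariable ? 2 * FpExtCost : FpExtCost;
  Cost += F32CmpCost; // the compare itself, performed at f32
  if (IsVector)
    Cost += TruncCost; // narrow the i32 mask back to the i16-wide result
  return Cost;
}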
- Cost += - getCastInstrCost(Instruction::Trunc, VectorType::getInteger(ValVTy), - VectorType::getInteger(PromotedTy), - TTI::CastContextHint::None, CostKind); + if (ValTy->isVectorTy()) + Cost += getCastInstrCost( + Instruction::Trunc, VectorType::getInteger(cast<VectorType>(ValTy)), + VectorType::getInteger(cast<VectorType>(PromotedTy)), + TTI::CastContextHint::None, CostKind); return Cost; } + + auto LT = getTypeLegalizationCost(ValTy); + // Model unknown fp compares as a libcall. + if (LT.second.getScalarType() != MVT::f64 && + LT.second.getScalarType() != MVT::f32 && + LT.second.getScalarType() != MVT::f16) + return LT.first * getCallInstrCost(/*Function*/ nullptr, ValTy, + {ValTy, ValTy}, CostKind); + + // Some comparison operators require expanding to multiple compares + or. + unsigned Factor = 1; + if (!CondTy->isVectorTy() && + (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ)) + Factor = 2; // fcmp with 2 selects + else if (isa<FixedVectorType>(ValTy) && + (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ || + VecPred == FCmpInst::FCMP_ORD || VecPred == FCmpInst::FCMP_UNO)) + Factor = 3; // fcmxx+fcmyy+or + else if (isa<ScalableVectorType>(ValTy) && + (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ)) + Factor = 3; // fcmxx+fcmyy+or + + return Factor * (CostKind == TTI::TCK_Latency ? 2 : LT.first); } // Treat the icmp in icmp(and, 0) or icmp(and, -1/1) when it can be folded to @@ -4371,7 +4420,7 @@ InstructionCost AArch64TTIImpl::getCmpSelInstrCost( // comparison is not unsigned. FIXME: Enable for non-throughput cost kinds // providing it will not cause performance regressions. if (CostKind == TTI::TCK_RecipThroughput && ValTy->isIntegerTy() && - ISD == ISD::SETCC && I && !CmpInst::isUnsigned(VecPred) && + Opcode == Instruction::ICmp && I && !CmpInst::isUnsigned(VecPred) && TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) && match(I->getOperand(0), m_And(m_Value(), m_Value()))) { if (match(I->getOperand(1), m_Zero())) @@ -6235,10 +6284,17 @@ bool AArch64TTIImpl::isProfitableToSinkOperands( } } - auto ShouldSinkCondition = [](Value *Cond) -> bool { + auto ShouldSinkCondition = [](Value *Cond, + SmallVectorImpl<Use *> &Ops) -> bool { + if (!isa<IntrinsicInst>(Cond)) + return false; auto *II = dyn_cast<IntrinsicInst>(Cond); - return II && II->getIntrinsicID() == Intrinsic::vector_reduce_or && - isa<ScalableVectorType>(II->getOperand(0)->getType()); + if (II->getIntrinsicID() != Intrinsic::vector_reduce_or || + !isa<ScalableVectorType>(II->getOperand(0)->getType())) + return false; + if (isa<CmpInst>(II->getOperand(0))) + Ops.push_back(&II->getOperandUse(0)); + return true; }; switch (I->getOpcode()) { @@ -6254,7 +6310,7 @@ bool AArch64TTIImpl::isProfitableToSinkOperands( } break; case Instruction::Select: { - if (!ShouldSinkCondition(I->getOperand(0))) + if (!ShouldSinkCondition(I->getOperand(0), Ops)) return false; Ops.push_back(&I->getOperandUse(0)); @@ -6264,7 +6320,7 @@ bool AArch64TTIImpl::isProfitableToSinkOperands( if (cast<BranchInst>(I)->isUnconditional()) return false; - if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition())) + if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition(), Ops)) return false; Ops.push_back(&I->getOperandUse(0)); diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index 6912caf..7a2b679 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ 
b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -79,8 +79,7 @@ public: } void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value) const override; @@ -421,9 +420,8 @@ static bool shouldForceRelocation(const MCFixup &Fixup) { } void AArch64AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { if (shouldForceRelocation(Fixup)) IsResolved = false; maybeAddReloc(F, Fixup, Target, Value, IsResolved); @@ -460,8 +458,8 @@ void AArch64AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, // Shift the value into position. Value <<= Info.TargetOffset; - unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!"); + assert(Fixup.getOffset() + NumBytes <= F.getSize() && + "Invalid fixup offset!"); // Used to point to big endian bytes. unsigned FulleSizeInBytes = getFixupKindContainereSizeInBytes(Fixup.getKind()); @@ -471,15 +469,16 @@ void AArch64AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, if (FulleSizeInBytes == 0) { // Handle as little-endian for (unsigned i = 0; i != NumBytes; ++i) { - Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); + Data[i] |= uint8_t((Value >> (i * 8)) & 0xff); } } else { // Handle as big-endian - assert((Offset + FulleSizeInBytes) <= Data.size() && "Invalid fixup size!"); + assert(Fixup.getOffset() + FulleSizeInBytes <= F.getSize() && + "Invalid fixup size!"); assert(NumBytes <= FulleSizeInBytes && "Invalid fixup size!"); for (unsigned i = 0; i != NumBytes; ++i) { unsigned Idx = FulleSizeInBytes - 1 - i; - Data[Offset + Idx] |= uint8_t((Value >> (i * 8)) & 0xff); + Data[Idx] |= uint8_t((Value >> (i * 8)) & 0xff); } } @@ -492,9 +491,9 @@ void AArch64AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, // If the immediate is negative, generate MOVN else MOVZ. // (Bit 30 = 0) ==> MOVN, (Bit 30 = 1) ==> MOVZ. if (SignedValue < 0) - Data[Offset + 3] &= ~(1 << 6); + Data[3] &= ~(1 << 6); else - Data[Offset + 3] |= (1 << 6); + Data[3] |= (1 << 6); } } diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp index 7618a57..45ac023 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -96,8 +96,8 @@ unsigned AArch64ELFObjectWriter::getRelocType(const MCFixup &Fixup, case AArch64::S_TPREL: case AArch64::S_TLSDESC: case AArch64::S_TLSDESC_AUTH: - if (auto *SA = Target.getAddSym()) - cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS); + if (auto *SA = const_cast<MCSymbol *>(Target.getAddSym())) + static_cast<MCSymbolELF *>(SA)->setType(ELF::STT_TLS); break; default: break; @@ -488,7 +488,8 @@ bool AArch64ELFObjectWriter::needsRelocateWithSymbol(const MCValue &Val, // this global needs to be tagged. In addition, the linker needs to know // whether to emit a special addend when relocating `end` symbols, and this // can only be determined by the attributes of the symbol itself. 
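An editorial sketch, not part of the patch: with the new applyFixup signature used in the AArch64AsmBackend hunks above, Data points directly at the fixup's offset within the fragment, so the little-endian patching loop indexes from zero instead of from Fixup.getOffset(). The same idea as standalone C++ (function name hypothetical):

#include <cstdint>

void orInFixupLittleEndian(uint8_t *Data, uint64_t Value, unsigned NumBytes) {
  for (unsigned I = 0; I != NumBytes; ++I)
    Data[I] |= uint8_t((Value >> (I * 8)) & 0xff); // OR in byte I of the value
}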
- if (Val.getAddSym() && cast<MCSymbolELF>(Val.getAddSym())->isMemtag()) + if (Val.getAddSym() && + static_cast<const MCSymbolELF *>(Val.getAddSym())->isMemtag()) return true; if ((Val.getSpecifier() & AArch64::S_GOT) == AArch64::S_GOT) diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index 6257e99..14547e3 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -418,7 +418,8 @@ private: } MCSymbol *emitMappingSymbol(StringRef Name) { - auto *Symbol = cast<MCSymbolELF>(getContext().createLocalSymbol(Name)); + auto *Symbol = + static_cast<MCSymbolELF *>(getContext().createLocalSymbol(Name)); emitLabel(Symbol); return Symbol; } @@ -455,7 +456,7 @@ void AArch64TargetELFStreamer::emitInst(uint32_t Inst) { void AArch64TargetELFStreamer::emitDirectiveVariantPCS(MCSymbol *Symbol) { getStreamer().getAssembler().registerSymbol(*Symbol); - cast<MCSymbolELF>(Symbol)->setOther(ELF::STO_AARCH64_VARIANT_PCS); + static_cast<MCSymbolELF *>(Symbol)->setOther(ELF::STO_AARCH64_VARIANT_PCS); } void AArch64TargetELFStreamer::finish() { @@ -541,7 +542,7 @@ void AArch64TargetELFStreamer::finish() { MCSectionELF *MemtagSec = nullptr; for (const MCSymbol &Symbol : Asm.symbols()) { - const auto &Sym = cast<MCSymbolELF>(Symbol); + auto &Sym = static_cast<const MCSymbolELF &>(Symbol); if (Sym.isMemtag()) { MemtagSec = Ctx.getELFSection(".memtag.globals.static", ELF::SHT_AARCH64_MEMTAG_GLOBALS_STATIC, 0); @@ -556,7 +557,7 @@ void AArch64TargetELFStreamer::finish() { S.switchSection(MemtagSec); const auto *Zero = MCConstantExpr::create(0, Ctx); for (const MCSymbol &Symbol : Asm.symbols()) { - const auto &Sym = cast<MCSymbolELF>(Symbol); + auto &Sym = static_cast<const MCSymbolELF &>(Symbol); if (!Sym.isMemtag()) continue; auto *SRE = MCSymbolRefExpr::create(&Sym, Ctx); diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 8a0c4ac..d84f512 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1160,6 +1160,12 @@ def FeatureTanhInsts : SubtargetFeature<"tanh-insts", "Has v_tanh_f32/f16 instructions" >; +def FeatureTensorCvtLutInsts : SubtargetFeature<"tensor-cvt-lut-insts", + "HasTensorCvtLutInsts", + "true", + "Has v_perm_pk16* instructions" +>; + def FeatureTransposeLoadF4F6Insts : SubtargetFeature<"transpose-load-f4f6-insts", "HasTransposeLoadF4F6Insts", "true", @@ -1359,6 +1365,13 @@ def FeatureXF32Insts : SubtargetFeature<"xf32-insts", "v_mfma_f32_16x16x8_xf32 and v_mfma_f32_32x32x4_xf32" >; +def FeatureGloballyAddressableScratch : SubtargetFeature< + "globally-addressable-scratch", + "HasGloballyAddressableScratch", + "true", + "FLAT instructions can access scratch memory for any thread in any wave" +>; + // FIXME: Remove after all users are migrated to attribute. 
def FeatureDynamicVGPR : SubtargetFeature <"dynamic-vgpr", "DynamicVGPR", @@ -2030,6 +2043,7 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureDPPSrc1SGPR, FeatureBitOp3Insts, FeatureTanhInsts, + FeatureTensorCvtLutInsts, FeatureTransposeLoadF4F6Insts, FeatureBF16TransInsts, FeatureBF16ConversionInsts, @@ -2048,6 +2062,7 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureAtomicFMinFMaxF64FlatInsts, FeatureFlatBufferGlobalAtomicFaddF64Inst, FeatureMemoryAtomicFAddF32DenormalSupport, + FeatureGloballyAddressableScratch, FeatureKernargPreload, FeatureVmemPrefInsts, FeatureLshlAddU64Inst, @@ -2785,6 +2800,9 @@ def HasBitOp3Insts : Predicate<"Subtarget->hasBitOp3Insts()">, def HasTanhInsts : Predicate<"Subtarget->hasTanhInsts()">, AssemblerPredicate<(all_of FeatureTanhInsts)>; +def HasTensorCvtLutInsts : Predicate<"Subtarget->hasTensorCvtLutInsts()">, + AssemblerPredicate<(all_of FeatureTensorCvtLutInsts)>; + def HasTransposeLoadF4F6Insts : Predicate<"Subtarget->hasTransposeLoadF4F6Insts()">, AssemblerPredicate<(all_of FeatureTransposeLoadF4F6Insts)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 992572f..394a143 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -51,18 +51,6 @@ def gi_vop3pmodsdot : GIComplexOperandMatcher<s32, "selectVOP3PModsDOT">, GIComplexPatternEquiv<VOP3PModsDOT>; -def gi_vop3pmodsneg : - GIComplexOperandMatcher<s32, "selectVOP3PModsNeg">, - GIComplexPatternEquiv<VOP3PModsNeg>; - -def gi_vop3pmodsnegs : - GIComplexOperandMatcher<s32, "selectVOP3PModsNegs">, - GIComplexPatternEquiv<VOP3PModsNegs>; - -def gi_dotiuvop3pmodsnegabs : - GIComplexOperandMatcher<s32, "selectVOP3PModsNegAbs">, - GIComplexPatternEquiv<VOP3PModsNegAbs>; - def gi_wmmaopselvop3pmods : GIComplexOperandMatcher<s32, "selectWMMAOpSelVOP3PMods">, GIComplexPatternEquiv<WMMAOpSelVOP3PMods>; @@ -452,6 +440,13 @@ def gi_fp_pow2_to_exponent : GICustomOperandRenderer<"renderFPPow2ToExponent">, def gi_as_hw_round_mode : GICustomOperandRenderer<"renderRoundMode">, GISDNodeXFormEquiv<as_hw_round_mode>; +def gi_VOP3PModsNeg : GICustomOperandRenderer<"renderVOP3PModsNeg">, + GISDNodeXFormEquiv<VOP3PModsNeg>; +def gi_VOP3PModsNegs : GICustomOperandRenderer<"renderVOP3PModsNegs">, + GISDNodeXFormEquiv<VOP3PModsNegs>; +def gi_VOP3PModsNegAbs : GICustomOperandRenderer<"renderVOP3PModsNegAbs">, + GISDNodeXFormEquiv<VOP3PModsNegAbs>; + def gi_prefetch_loc : GICustomOperandRenderer<"renderPrefetchLoc">, GISDNodeXFormEquiv<PrefetchLoc>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 39b4200..fb83388 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -3449,63 +3449,6 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src, return SelectVOP3PMods(In, Src, SrcMods, true); } -// Select neg_lo from the i1 immediate operand. -bool AMDGPUDAGToDAGISel::SelectVOP3PModsNeg(SDValue In, SDValue &Src) const { - const ConstantSDNode *C = cast<ConstantSDNode>(In); - // Literal i1 value set in intrinsic, represents SrcMods for the next operand. - // 1 promotes packed values to signed, 0 treats them as unsigned. 
- assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value"); - - unsigned Mods = SISrcMods::OP_SEL_1; - unsigned SrcSign = C->getZExtValue(); - if (SrcSign == 1) - Mods ^= SISrcMods::NEG; - - Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); - return true; -} - -// Select both neg_lo and neg_hi from the i1 immediate operand. This is -// specifically for F16/BF16 operands in WMMA instructions, where neg_lo applies -// to matrix's even k elements, and neg_hi applies to matrix's odd k elements. -bool AMDGPUDAGToDAGISel::SelectVOP3PModsNegs(SDValue In, SDValue &Src) const { - const ConstantSDNode *C = cast<ConstantSDNode>(In); - // Literal i1 value set in intrinsic, represents SrcMods for the next operand. - // 1 promotes packed values to signed, 0 treats them as unsigned. - assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value"); - - unsigned Mods = SISrcMods::OP_SEL_1; - unsigned SrcSign = C->getZExtValue(); - if (SrcSign == 1) - Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); - - Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); - return true; -} - -// Select neg, abs, or both neg and abs from the i16 immediate operans. -bool AMDGPUDAGToDAGISel::SelectVOP3PModsNegAbs(SDValue In, SDValue &Src) const { - const ConstantSDNode *C = cast<ConstantSDNode>(In); - unsigned Mods = SISrcMods::OP_SEL_1; - unsigned SrcMod = C->getZExtValue(); - switch (SrcMod) { - default: // Any other value will be silently ignored (considered as 0). - break; - case 1: - Mods ^= SISrcMods::NEG; - break; - case 2: - Mods ^= SISrcMods::ABS; - break; - case 3: - Mods ^= (SISrcMods::NEG | SISrcMods::ABS); - break; - } - - Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); - return true; -} - bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const { const ConstantSDNode *C = cast<ConstantSDNode>(In); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 983f1aa..16388e7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -241,9 +241,6 @@ private: bool IsDOT = false) const; bool SelectVOP3PModsDOT(SDValue In, SDValue &Src, SDValue &SrcMods) const; - bool SelectVOP3PModsNeg(SDValue In, SDValue &Src) const; - bool SelectVOP3PModsNegs(SDValue In, SDValue &Src) const; - bool SelectVOP3PModsNegAbs(SDValue In, SDValue &Src) const; bool SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const; bool SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 31c4f62..64e68ab 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -589,14 +589,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setSchedulingPreference(Sched::RegPressure); setJumpIsExpensive(true); - // FIXME: This is only partially true. If we have to do vector compares, any - // SGPR pair can be a condition register. If we have a uniform condition, we - // are better off doing SALU operations, where there is only one SCC. For now, - // we don't have a way of knowing during instruction selection if a condition - // will be uniform and we always use vector compares. Assume we are using - // vector compares until that is fixed. 
- setHasMultipleConditionRegisters(true); - setMinCmpXchgSizeInBits(32); setSupportsUnalignedAtomics(false); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 39bb0ad..fd5d5b8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -388,6 +388,16 @@ public: MVT getFenceOperandTy(const DataLayout &DL) const override { return MVT::i32; } + + bool hasMultipleConditionRegisters(EVT VT) const override { + // FIXME: This is only partially true. If we have to do vector compares, any + // SGPR pair can be a condition register. If we have a uniform condition, we + // are better off doing SALU operations, where there is only one SCC. For + // now, we don't have a way of knowing during instruction selection if a + // condition will be uniform and we always use vector compares. Assume we + // are using vector compares until that is fixed. + return true; + } }; namespace AMDGPUISD { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index f2207ff..4fe5d00 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -1694,7 +1694,9 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { NewII->takeName(&II); return IC.replaceInstUsesWith(II, NewII); } - case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4: { + case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4: + case Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4: + case Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4: { Value *Src0 = II.getArgOperand(1); Value *Src1 = II.getArgOperand(3); unsigned FmtA = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index b0d3b12..b7fd131 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -4988,66 +4988,6 @@ AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const { return selectVOP3PRetHelper(Root, true); } -// Select neg_lo from the i1 immediate operand. -InstructionSelector::ComplexRendererFns -AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const { - // Literal i1 value set in intrinsic, represents SrcMods for the next operand. - // Value is in Imm operand as i1 sign extended to int64_t. - // 1(-1) promotes packed values to signed, 0 treats them as unsigned. - assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) && - "expected i1 value"); - unsigned Mods = SISrcMods::OP_SEL_1; - if (Root.getImm() == -1) - Mods ^= SISrcMods::NEG; - return {{ - [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods - }}; -} - -// Select both neg_lo and neg_hi from the i1 immediate operand. This is -// specifically for F16/BF16 operands in WMMA instructions, where neg_lo applies -// to matrix's even k elements, and neg_hi applies to matrix's odd k elements. -InstructionSelector::ComplexRendererFns -AMDGPUInstructionSelector::selectVOP3PModsNegs(MachineOperand &Root) const { - // Literal i1 value set in intrinsic, represents SrcMods for the next operand. - // Value is in Imm operand as i1 sign extended to int64_t. - // 1(-1) promotes packed values to signed, 0 treats them as unsigned. 
- assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) && - "expected i1 value"); - unsigned Mods = SISrcMods::OP_SEL_1; - if (Root.getImm() == -1) - Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); - return {{ - [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods - }}; -} - -// Select neg, abs, or both neg and abs from the i16 immediate operans. -InstructionSelector::ComplexRendererFns -AMDGPUInstructionSelector::selectVOP3PModsNegAbs(MachineOperand &Root) const { - - assert(Root.isImm() && "Modifier for C must be an immediate"); - - unsigned Mods = SISrcMods::OP_SEL_1; - switch (Root.getImm()) { - default: // Any other value will be silently ignored (considered as 0). - break; - case 1: - Mods ^= SISrcMods::NEG; - break; - case 2: - Mods ^= SISrcMods::ABS; - break; - case 3: - Mods ^= (SISrcMods::NEG | SISrcMods::ABS); - break; - } - - return {{ - [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods - }}; -} - InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods( MachineOperand &Root) const { @@ -7102,6 +7042,38 @@ void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB, MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4); } +void AMDGPUInstructionSelector::renderVOP3PModsNeg(MachineInstrBuilder &MIB, + const MachineInstr &MI, + int OpIdx) const { + unsigned Mods = SISrcMods::OP_SEL_1; + if (MI.getOperand(OpIdx).getImm()) + Mods ^= SISrcMods::NEG; + MIB.addImm((int64_t)Mods); +} + +void AMDGPUInstructionSelector::renderVOP3PModsNegs(MachineInstrBuilder &MIB, + const MachineInstr &MI, + int OpIdx) const { + unsigned Mods = SISrcMods::OP_SEL_1; + if (MI.getOperand(OpIdx).getImm()) + Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); + MIB.addImm((int64_t)Mods); +} + +void AMDGPUInstructionSelector::renderVOP3PModsNegAbs(MachineInstrBuilder &MIB, + const MachineInstr &MI, + int OpIdx) const { + unsigned Val = MI.getOperand(OpIdx).getImm(); + unsigned Mods = SISrcMods::OP_SEL_1; // default: none + if (Val == 1) // neg + Mods ^= SISrcMods::NEG; + if (Val == 2) // abs + Mods ^= SISrcMods::ABS; + if (Val == 3) // neg and abs + Mods ^= (SISrcMods::NEG | SISrcMods::ABS); + MIB.addImm((int64_t)Mods); +} + void AMDGPUInstructionSelector::renderPrefetchLoc(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 140e753..c9da419 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -200,13 +200,6 @@ private: selectVOP3PModsDOT(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns - selectVOP3PModsNeg(MachineOperand &Root) const; - InstructionSelector::ComplexRendererFns - selectVOP3PModsNegs(MachineOperand &Root) const; - InstructionSelector::ComplexRendererFns - selectVOP3PModsNegAbs(MachineOperand &Root) const; - - InstructionSelector::ComplexRendererFns selectWMMAOpSelVOP3PMods(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns @@ -419,6 +412,13 @@ private: void renderRoundMode(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const; + void renderVOP3PModsNeg(MachineInstrBuilder &MIB, const MachineInstr &MI, + int OpIdx) const; + void renderVOP3PModsNegs(MachineInstrBuilder &MIB, const MachineInstr &MI, + int OpIdx) const; + void renderVOP3PModsNegAbs(MachineInstrBuilder &MIB, const MachineInstr &MI, + int OpIdx) const; + void renderPrefetchLoc(MachineInstrBuilder 
&MIB, const MachineInstr &MI, int OpIdx) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 1fdf272..a6e4a63 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -2271,6 +2271,9 @@ Register AMDGPULegalizerInfo::getSegmentAperture( const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS) ? AMDGPU::SRC_SHARED_BASE : AMDGPU::SRC_PRIVATE_BASE; + assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE || + !ST.hasGloballyAddressableScratch()) && + "Cannot use src_private_base with globally addressable scratch!"); // FIXME: It would be more natural to emit a COPY here, but then copy // coalescing would kick in and it would think it's okay to use the "HI" // subregister (instead of extracting the HI 32 bits) which is an artificial @@ -2396,11 +2399,30 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( if (SrcAS == AMDGPUAS::FLAT_ADDRESS && (DestAS == AMDGPUAS::LOCAL_ADDRESS || DestAS == AMDGPUAS::PRIVATE_ADDRESS)) { + auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register { + if (DestAS == AMDGPUAS::PRIVATE_ADDRESS && + ST.hasGloballyAddressableScratch()) { + // flat -> private with globally addressable scratch: subtract + // src_flat_scratch_base_lo. + const LLT S32 = LLT::scalar(32); + Register SrcLo = B.buildExtract(S32, Src, 0).getReg(0); + Register FlatScratchBaseLo = + B.buildInstr(AMDGPU::S_MOV_B32, {S32}, + {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)}) + .getReg(0); + MRI.setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass); + Register Sub = B.buildSub(S32, SrcLo, FlatScratchBaseLo).getReg(0); + return B.buildIntToPtr(Dst, Sub).getReg(0); + } + + // Extract low 32-bits of the pointer. + return B.buildExtract(Dst, Src, 0).getReg(0); + }; + // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for // G_ADDRSPACE_CAST we need to guess. if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) { - // Extract low 32-bits of the pointer. - B.buildExtract(Dst, Src, 0); + castFlatToLocalOrPrivate(Dst); MI.eraseFromParent(); return true; } @@ -2411,7 +2433,7 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( auto FlatNull = B.buildConstant(SrcTy, 0); // Extract low 32-bits of the pointer. - auto PtrLo32 = B.buildExtract(DstTy, Src, 0); + auto PtrLo32 = castFlatToLocalOrPrivate(DstTy); auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); @@ -2425,14 +2447,45 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( (SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) { auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register { - Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); - if (!ApertureReg.isValid()) - return false; - // Coerce the type of the low half of the result so we can use // merge_values. 
Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); + if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS && + ST.hasGloballyAddressableScratch()) { + // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr + // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr + Register AllOnes = B.buildConstant(S32, -1).getReg(0); + Register ThreadID = B.buildConstant(S32, 0).getReg(0); + ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {S32}) + .addUse(AllOnes) + .addUse(ThreadID) + .getReg(0); + if (ST.isWave64()) { + ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {S32}) + .addUse(AllOnes) + .addUse(ThreadID) + .getReg(0); + } + Register ShAmt = + B.buildConstant(S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0); + Register SrcHi = B.buildShl(S32, ThreadID, ShAmt).getReg(0); + Register CvtPtr = + B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).getReg(0); + // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full + // 64-bit hi:lo value. + Register FlatScratchBase = + B.buildInstr(AMDGPU::S_MOV_B64, {S64}, + {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)}) + .getReg(0); + MRI.setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass); + return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0); + } + + Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); + if (!ApertureReg.isValid()) + return false; + // TODO: Should we allow mismatched types but matching sizes in merges to // avoid the ptrtoint? return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0); @@ -5788,11 +5841,25 @@ bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const { - Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); - auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); + const LLT S32 = LLT::scalar(32); + auto Unmerge = B.buildUnmerge(S32, MI.getOperand(2).getReg()); Register Hi32 = Unmerge.getReg(1); - B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); + if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS && + ST.hasGloballyAddressableScratch()) { + Register FlatScratchBaseHi = + B.buildInstr(AMDGPU::S_MOV_B32, {S32}, + {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)}) + .getReg(0); + MRI.setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass); + // Test bits 63..58 against the aperture address. 
+ Register XOR = B.buildXor(S32, Hi32, FlatScratchBaseHi).getReg(0); + B.buildICmp(ICmpInst::ICMP_ULT, MI.getOperand(0), XOR, + B.buildConstant(S32, 1u << 26)); + } else { + Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); + B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); + } MI.eraseFromParent(); return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index d443f4e..2d8f259 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -236,7 +236,7 @@ cl::opt<LoweringKind> LoweringKindLoc( "Lower via mixture of above strategies"))); template <typename T> std::vector<T> sortByName(std::vector<T> &&V) { - llvm::sort(V.begin(), V.end(), [](const auto *L, const auto *R) { + llvm::sort(V, [](const auto *L, const auto *R) { return L->getName() < R->getName(); }); return {std::move(V)}; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 5aa0ebf..868b1a2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4603,6 +4603,42 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_cvt_scale_pk8_f32_fp8: case Intrinsic::amdgcn_cvt_scale_pk8_f32_bf8: case Intrinsic::amdgcn_cvt_scale_pk8_f32_fp4: + case Intrinsic::amdgcn_cvt_scale_pk16_f16_fp6: + case Intrinsic::amdgcn_cvt_scale_pk16_bf16_fp6: + case Intrinsic::amdgcn_cvt_scale_pk16_f16_bf6: + case Intrinsic::amdgcn_cvt_scale_pk16_bf16_bf6: + case Intrinsic::amdgcn_cvt_scale_pk16_f32_fp6: + case Intrinsic::amdgcn_cvt_scale_pk16_f32_bf6: + case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_bf16: + case Intrinsic::amdgcn_cvt_scalef32_pk8_bf8_bf16: + case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_f16: + case Intrinsic::amdgcn_cvt_scalef32_pk8_bf8_f16: + case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_f32: + case Intrinsic::amdgcn_cvt_scalef32_pk8_bf8_f32: + case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_f32: + case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_f16: + case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_bf16: + case Intrinsic::amdgcn_cvt_scalef32_pk16_fp6_f32: + case Intrinsic::amdgcn_cvt_scalef32_pk16_bf6_f32: + case Intrinsic::amdgcn_cvt_scalef32_pk16_fp6_f16: + case Intrinsic::amdgcn_cvt_scalef32_pk16_bf6_f16: + case Intrinsic::amdgcn_cvt_scalef32_pk16_fp6_bf16: + case Intrinsic::amdgcn_cvt_scalef32_pk16_bf6_bf16: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp8_bf16: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_bf8_bf16: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp8_f16: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_bf8_f16: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp8_f32: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_bf8_f32: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_f32: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_f16: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_bf16: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_f32: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_f32: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_f16: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_f16: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_bf16: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_bf16: case Intrinsic::amdgcn_sat_pk4_i4_i8: case Intrinsic::amdgcn_sat_pk4_u4_u8: case Intrinsic::amdgcn_fmed3: @@ -4762,7 +4798,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { 
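An editorial sketch, not part of the patch: the globally-addressable-scratch address arithmetic from the AMDGPULegalizerInfo hunks above, restated as plain C++ with the register and intrinsic plumbing omitted (function names are hypothetical; the shift amounts and the 2^26 bound come from the patch itself):

#include <cstdint>

// private -> flat: high word = TID << (57 - 32 - log2(wave size)),
// low word = private offset, then add the 64-bit flat-scratch base.
uint64_t privateToFlat(uint32_t PrivateOffset, uint32_t ThreadId,
                       uint64_t FlatScratchBase, unsigned WavefrontSizeLog2) {
  uint32_t Hi = ThreadId << (57 - 32 - WavefrontSizeLog2);
  uint64_t Cvt = (uint64_t(Hi) << 32) | PrivateOffset;
  return Cvt + FlatScratchBase;
}

// flat -> private: the inverse on the low half, subtracting the base's low word.
uint32_t flatToPrivate(uint64_t FlatAddr, uint32_t FlatScratchBaseLo) {
  return uint32_t(FlatAddr) - FlatScratchBaseLo;
}

// "is this flat address private?": bits 63..58 must match the flat-scratch
// base; XOR clears them exactly when they match, leaving a value below 2^26.
bool flatAddrIsPrivate(uint32_t AddrHi32, uint32_t FlatScratchBaseHi) {
  return (AddrHi32 ^ FlatScratchBaseHi) < (1u << 26);
}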
case Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_bf8: case Intrinsic::amdgcn_wmma_i32_16x16x64_iu8: case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4: + case Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4: + case Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4: case Intrinsic::amdgcn_wmma_f32_32x16x128_f4: + case Intrinsic::amdgcn_wmma_scale_f32_32x16x128_f4: + case Intrinsic::amdgcn_wmma_scale16_f32_32x16x128_f4: case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16: case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16: case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16: @@ -4777,6 +4817,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8: case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: + case Intrinsic::amdgcn_perm_pk16_b4_u4: + case Intrinsic::amdgcn_perm_pk16_b6_u4: + case Intrinsic::amdgcn_perm_pk16_b8_u4: return getDefaultMappingVOP(MI); case Intrinsic::amdgcn_log: case Intrinsic::amdgcn_exp2: diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index a83caa0..ff8efd2 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -178,6 +178,10 @@ public: ImmTyBitOp3, ImmTyMatrixAFMT, ImmTyMatrixBFMT, + ImmTyMatrixAScale, + ImmTyMatrixBScale, + ImmTyMatrixAScaleFmt, + ImmTyMatrixBScaleFmt, ImmTyMatrixAReuse, ImmTyMatrixBReuse, ImmTyScaleSel, @@ -428,6 +432,10 @@ public: bool isIndexKey32bit() const { return isImmTy(ImmTyIndexKey32bit); } bool isMatrixAFMT() const { return isImmTy(ImmTyMatrixAFMT); } bool isMatrixBFMT() const { return isImmTy(ImmTyMatrixBFMT); } + bool isMatrixAScale() const { return isImmTy(ImmTyMatrixAScale); } + bool isMatrixBScale() const { return isImmTy(ImmTyMatrixBScale); } + bool isMatrixAScaleFmt() const { return isImmTy(ImmTyMatrixAScaleFmt); } + bool isMatrixBScaleFmt() const { return isImmTy(ImmTyMatrixBScaleFmt); } bool isMatrixAReuse() const { return isImmTy(ImmTyMatrixAReuse); } bool isMatrixBReuse() const { return isImmTy(ImmTyMatrixBReuse); } bool isTFE() const { return isImmTy(ImmTyTFE); } @@ -1183,6 +1191,10 @@ public: case ImmTyBitOp3: OS << "BitOp3"; break; case ImmTyMatrixAFMT: OS << "ImmTyMatrixAFMT"; break; case ImmTyMatrixBFMT: OS << "ImmTyMatrixBFMT"; break; + case ImmTyMatrixAScale: OS << "ImmTyMatrixAScale"; break; + case ImmTyMatrixBScale: OS << "ImmTyMatrixBScale"; break; + case ImmTyMatrixAScaleFmt: OS << "ImmTyMatrixAScaleFmt"; break; + case ImmTyMatrixBScaleFmt: OS << "ImmTyMatrixBScaleFmt"; break; case ImmTyMatrixAReuse: OS << "ImmTyMatrixAReuse"; break; case ImmTyMatrixBReuse: OS << "ImmTyMatrixBReuse"; break; case ImmTyScaleSel: OS << "ScaleSel" ; break; @@ -1608,6 +1620,10 @@ public: return getFeatureBits()[AMDGPU::FeaturePartialNSAEncoding]; } + bool hasGloballyAddressableScratch() const { + return getFeatureBits()[AMDGPU::FeatureGloballyAddressableScratch]; + } + unsigned getNSAMaxSize(bool HasSampler = false) const { return AMDGPU::getNSAMaxSize(getSTI(), HasSampler); } @@ -1728,6 +1744,14 @@ public: AMDGPUOperand::ImmTy Type); ParseStatus parseMatrixAFMT(OperandVector &Operands); ParseStatus parseMatrixBFMT(OperandVector &Operands); + ParseStatus tryParseMatrixScale(OperandVector &Operands, StringRef Name, + AMDGPUOperand::ImmTy Type); + ParseStatus parseMatrixAScale(OperandVector &Operands); + ParseStatus parseMatrixBScale(OperandVector &Operands); + ParseStatus 
tryParseMatrixScaleFmt(OperandVector &Operands, StringRef Name, + AMDGPUOperand::ImmTy Type); + ParseStatus parseMatrixAScaleFmt(OperandVector &Operands); + ParseStatus parseMatrixBScaleFmt(OperandVector &Operands); ParseStatus parseDfmtNfmt(int64_t &Format); ParseStatus parseUfmt(int64_t &Format); @@ -2739,46 +2763,48 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) { static MCRegister getSpecialRegForName(StringRef RegName) { return StringSwitch<unsigned>(RegName) - .Case("exec", AMDGPU::EXEC) - .Case("vcc", AMDGPU::VCC) - .Case("flat_scratch", AMDGPU::FLAT_SCR) - .Case("xnack_mask", AMDGPU::XNACK_MASK) - .Case("shared_base", AMDGPU::SRC_SHARED_BASE) - .Case("src_shared_base", AMDGPU::SRC_SHARED_BASE) - .Case("shared_limit", AMDGPU::SRC_SHARED_LIMIT) - .Case("src_shared_limit", AMDGPU::SRC_SHARED_LIMIT) - .Case("private_base", AMDGPU::SRC_PRIVATE_BASE) - .Case("src_private_base", AMDGPU::SRC_PRIVATE_BASE) - .Case("private_limit", AMDGPU::SRC_PRIVATE_LIMIT) - .Case("src_private_limit", AMDGPU::SRC_PRIVATE_LIMIT) - .Case("pops_exiting_wave_id", AMDGPU::SRC_POPS_EXITING_WAVE_ID) - .Case("src_pops_exiting_wave_id", AMDGPU::SRC_POPS_EXITING_WAVE_ID) - .Case("lds_direct", AMDGPU::LDS_DIRECT) - .Case("src_lds_direct", AMDGPU::LDS_DIRECT) - .Case("m0", AMDGPU::M0) - .Case("vccz", AMDGPU::SRC_VCCZ) - .Case("src_vccz", AMDGPU::SRC_VCCZ) - .Case("execz", AMDGPU::SRC_EXECZ) - .Case("src_execz", AMDGPU::SRC_EXECZ) - .Case("scc", AMDGPU::SRC_SCC) - .Case("src_scc", AMDGPU::SRC_SCC) - .Case("tba", AMDGPU::TBA) - .Case("tma", AMDGPU::TMA) - .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO) - .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI) - .Case("xnack_mask_lo", AMDGPU::XNACK_MASK_LO) - .Case("xnack_mask_hi", AMDGPU::XNACK_MASK_HI) - .Case("vcc_lo", AMDGPU::VCC_LO) - .Case("vcc_hi", AMDGPU::VCC_HI) - .Case("exec_lo", AMDGPU::EXEC_LO) - .Case("exec_hi", AMDGPU::EXEC_HI) - .Case("tma_lo", AMDGPU::TMA_LO) - .Case("tma_hi", AMDGPU::TMA_HI) - .Case("tba_lo", AMDGPU::TBA_LO) - .Case("tba_hi", AMDGPU::TBA_HI) - .Case("pc", AMDGPU::PC_REG) - .Case("null", AMDGPU::SGPR_NULL) - .Default(AMDGPU::NoRegister); + .Case("exec", AMDGPU::EXEC) + .Case("vcc", AMDGPU::VCC) + .Case("flat_scratch", AMDGPU::FLAT_SCR) + .Case("xnack_mask", AMDGPU::XNACK_MASK) + .Case("shared_base", AMDGPU::SRC_SHARED_BASE) + .Case("src_shared_base", AMDGPU::SRC_SHARED_BASE) + .Case("shared_limit", AMDGPU::SRC_SHARED_LIMIT) + .Case("src_shared_limit", AMDGPU::SRC_SHARED_LIMIT) + .Case("private_base", AMDGPU::SRC_PRIVATE_BASE) + .Case("src_private_base", AMDGPU::SRC_PRIVATE_BASE) + .Case("private_limit", AMDGPU::SRC_PRIVATE_LIMIT) + .Case("src_private_limit", AMDGPU::SRC_PRIVATE_LIMIT) + .Case("src_flat_scratch_base_lo", AMDGPU::SRC_FLAT_SCRATCH_BASE_LO) + .Case("src_flat_scratch_base_hi", AMDGPU::SRC_FLAT_SCRATCH_BASE_HI) + .Case("pops_exiting_wave_id", AMDGPU::SRC_POPS_EXITING_WAVE_ID) + .Case("src_pops_exiting_wave_id", AMDGPU::SRC_POPS_EXITING_WAVE_ID) + .Case("lds_direct", AMDGPU::LDS_DIRECT) + .Case("src_lds_direct", AMDGPU::LDS_DIRECT) + .Case("m0", AMDGPU::M0) + .Case("vccz", AMDGPU::SRC_VCCZ) + .Case("src_vccz", AMDGPU::SRC_VCCZ) + .Case("execz", AMDGPU::SRC_EXECZ) + .Case("src_execz", AMDGPU::SRC_EXECZ) + .Case("scc", AMDGPU::SRC_SCC) + .Case("src_scc", AMDGPU::SRC_SCC) + .Case("tba", AMDGPU::TBA) + .Case("tma", AMDGPU::TMA) + .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO) + .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI) + .Case("xnack_mask_lo", AMDGPU::XNACK_MASK_LO) + .Case("xnack_mask_hi", AMDGPU::XNACK_MASK_HI) + 
.Case("vcc_lo", AMDGPU::VCC_LO) + .Case("vcc_hi", AMDGPU::VCC_HI) + .Case("exec_lo", AMDGPU::EXEC_LO) + .Case("exec_hi", AMDGPU::EXEC_HI) + .Case("tma_lo", AMDGPU::TMA_LO) + .Case("tma_hi", AMDGPU::TMA_HI) + .Case("tba_lo", AMDGPU::TBA_LO) + .Case("tba_hi", AMDGPU::TBA_HI) + .Case("pc", AMDGPU::PC_REG) + .Case("null", AMDGPU::SGPR_NULL) + .Default(AMDGPU::NoRegister); } bool AMDGPUAsmParser::ParseRegister(MCRegister &RegNo, SMLoc &StartLoc, @@ -6724,6 +6750,9 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI, case SRC_PRIVATE_LIMIT_LO: case SRC_PRIVATE_LIMIT: return isGFX9Plus(); + case SRC_FLAT_SCRATCH_BASE_LO: + case SRC_FLAT_SCRATCH_BASE_HI: + return hasGloballyAddressableScratch(); case SRC_POPS_EXITING_WAVE_ID: return isGFX9Plus() && !isGFX11Plus(); case TBA: @@ -7356,6 +7385,42 @@ ParseStatus AMDGPUAsmParser::parseMatrixBFMT(OperandVector &Operands) { AMDGPUOperand::ImmTyMatrixBFMT); } +ParseStatus AMDGPUAsmParser::tryParseMatrixScale(OperandVector &Operands, + StringRef Name, + AMDGPUOperand::ImmTy Type) { + return parseStringOrIntWithPrefix( + Operands, Name, {"MATRIX_SCALE_ROW0", "MATRIX_SCALE_ROW1"}, Type); +} + +ParseStatus AMDGPUAsmParser::parseMatrixAScale(OperandVector &Operands) { + return tryParseMatrixScale(Operands, "matrix_a_scale", + AMDGPUOperand::ImmTyMatrixAScale); +} + +ParseStatus AMDGPUAsmParser::parseMatrixBScale(OperandVector &Operands) { + return tryParseMatrixScale(Operands, "matrix_b_scale", + AMDGPUOperand::ImmTyMatrixBScale); +} + +ParseStatus AMDGPUAsmParser::tryParseMatrixScaleFmt(OperandVector &Operands, + StringRef Name, + AMDGPUOperand::ImmTy Type) { + return parseStringOrIntWithPrefix( + Operands, Name, + {"MATRIX_SCALE_FMT_E8", "MATRIX_SCALE_FMT_E5M3", "MATRIX_SCALE_FMT_E4M3"}, + Type); +} + +ParseStatus AMDGPUAsmParser::parseMatrixAScaleFmt(OperandVector &Operands) { + return tryParseMatrixScaleFmt(Operands, "matrix_a_scale_fmt", + AMDGPUOperand::ImmTyMatrixAScaleFmt); +} + +ParseStatus AMDGPUAsmParser::parseMatrixBScaleFmt(OperandVector &Operands) { + return tryParseMatrixScaleFmt(Operands, "matrix_b_scale_fmt", + AMDGPUOperand::ImmTyMatrixBScaleFmt); +} + // dfmt and nfmt (in a tbuffer instruction) are parsed as one to allow their // values to live in a joint format operand in the MCInst encoding. 
ParseStatus AMDGPUAsmParser::parseDfmtNfmt(int64_t &Format) { @@ -9489,6 +9554,34 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands, AMDGPUOperand::ImmTyMatrixBFMT, 0); } + int MatrixAScaleIdx = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::matrix_a_scale); + if (MatrixAScaleIdx != -1) { + addOptionalImmOperand(Inst, Operands, OptIdx, + AMDGPUOperand::ImmTyMatrixAScale, 0); + } + + int MatrixBScaleIdx = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::matrix_b_scale); + if (MatrixBScaleIdx != -1) { + addOptionalImmOperand(Inst, Operands, OptIdx, + AMDGPUOperand::ImmTyMatrixBScale, 0); + } + + int MatrixAScaleFmtIdx = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::matrix_a_scale_fmt); + if (MatrixAScaleFmtIdx != -1) { + addOptionalImmOperand(Inst, Operands, OptIdx, + AMDGPUOperand::ImmTyMatrixAScaleFmt, 0); + } + + int MatrixBScaleFmtIdx = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::matrix_b_scale_fmt); + if (MatrixBScaleFmtIdx != -1) { + addOptionalImmOperand(Inst, Operands, OptIdx, + AMDGPUOperand::ImmTyMatrixBScaleFmt, 0); + } + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::matrix_a_reuse)) addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyMatrixAReuse, 0); diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index ffe6b06..fb7d634 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -598,6 +598,13 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, // Try to decode DPP and SDWA first to solve conflict with VOP1 and VOP2 // encodings + if (isGFX1250() && Bytes.size() >= 16) { + DecoderUInt128 DecW = eat16Bytes(Bytes); + if (tryDecodeInst(DecoderTableGFX1250128, MI, DecW, Address, CS)) + break; + Bytes = Bytes_.slice(0, MaxInstBytesNum); + } + if (isGFX11Plus() && Bytes.size() >= 12 ) { DecoderUInt128 DecW = eat12Bytes(Bytes); @@ -1907,6 +1914,8 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const { return isGFX11Plus() ? 
createRegOperand(M0) : createRegOperand(SGPR_NULL); case 126: return createRegOperand(EXEC_LO); case 127: return createRegOperand(EXEC_HI); + case 230: return createRegOperand(SRC_FLAT_SCRATCH_BASE_LO); + case 231: return createRegOperand(SRC_FLAT_SCRATCH_BASE_HI); case 235: return createRegOperand(SRC_SHARED_BASE_LO); case 236: return createRegOperand(SRC_SHARED_LIMIT_LO); case 237: return createRegOperand(SRC_PRIVATE_BASE_LO); @@ -1940,6 +1949,7 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const { return createRegOperand(SGPR_NULL); break; case 126: return createRegOperand(EXEC); + case 230: return createRegOperand(SRC_FLAT_SCRATCH_BASE_LO); case 235: return createRegOperand(SRC_SHARED_BASE); case 236: return createRegOperand(SRC_SHARED_LIMIT); case 237: return createRegOperand(SRC_PRIVATE_BASE); diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 6fe3abc..5530886 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -236,6 +236,7 @@ protected: bool Has64BitLiterals = false; bool HasBitOp3Insts = false; bool HasTanhInsts = false; + bool HasTensorCvtLutInsts = false; bool HasTransposeLoadF4F6Insts = false; bool HasPrngInst = false; bool HasBVHDualAndBVH8Insts = false; @@ -280,6 +281,7 @@ protected: bool RequiresCOV6 = false; bool UseBlockVGPROpsForCSR = false; + bool HasGloballyAddressableScratch = false; // Dummy feature to use for assembler in tablegen. bool FeatureDisable = false; @@ -1324,6 +1326,10 @@ public: bool useVGPRBlockOpsForCSR() const { return UseBlockVGPROpsForCSR; } + bool hasGloballyAddressableScratch() const { + return HasGloballyAddressableScratch; + } + bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; } bool hasVALUReadSGPRHazard() const { return GFX12Insts && !GFX1250Insts; } @@ -1411,6 +1417,8 @@ public: bool hasTanhInsts() const { return HasTanhInsts; } + bool hasTensorCvtLutInsts() const { return HasTensorCvtLutInsts; } + bool hasAddPC64Inst() const { return GFX1250Insts; } bool hasMinimum3Maximum3PKF16() const { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp index 86d56855..4e4660c 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -33,8 +33,7 @@ public: AMDGPUAsmBackend(const Target &T) : MCAsmBackend(llvm::endianness::little) {} void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value) const override; @@ -129,9 +128,8 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, } void AMDGPUAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { if (Target.getSpecifier()) IsResolved = false; maybeAddReloc(F, Fixup, Target, Value, IsResolved); @@ -148,13 +146,13 @@ void AMDGPUAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, Value <<= Info.TargetOffset; unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); - uint32_t Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!"); + assert(Fixup.getOffset() + 
NumBytes <= F.getSize() && + "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the bits from // the fixup value. for (unsigned i = 0; i != NumBytes; ++i) - Data[Offset + i] |= static_cast<uint8_t>((Value >> (i * 8)) & 0xff); + Data[i] |= static_cast<uint8_t>((Value >> (i * 8)) & 0xff); } std::optional<MCFixupKind> diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index 42c4d8b..ee8683a 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -1393,6 +1393,75 @@ void AMDGPUInstPrinter::printMatrixBFMT(const MCInst *MI, unsigned OpNo, printMatrixFMT(MI, OpNo, STI, O, 'b'); } +void AMDGPUInstPrinter::printMatrixScale(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O, char AorB) { + auto Imm = MI->getOperand(OpNo).getImm() & 1; + if (Imm == 0) + return; + + O << " matrix_" << AorB << "_scale:"; + switch (Imm) { + default: + O << Imm; + break; + case WMMA::MatrixScale::MATRIX_SCALE_ROW0: + O << "MATRIX_SCALE_ROW0"; + break; + case WMMA::MatrixScale::MATRIX_SCALE_ROW1: + O << "MATRIX_SCALE_ROW1"; + break; + } +} + +void AMDGPUInstPrinter::printMatrixAScale(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printMatrixScale(MI, OpNo, STI, O, 'a'); +} + +void AMDGPUInstPrinter::printMatrixBScale(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printMatrixScale(MI, OpNo, STI, O, 'b'); +} + +void AMDGPUInstPrinter::printMatrixScaleFmt(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O, char AorB) { + auto Imm = MI->getOperand(OpNo).getImm() & 3; + if (Imm == 0) + return; + + O << " matrix_" << AorB << "_scale_fmt:"; + switch (Imm) { + default: + O << Imm; + break; + case WMMA::MatrixScaleFmt::MATRIX_SCALE_FMT_E8: + O << "MATRIX_SCALE_FMT_E8"; + break; + case WMMA::MatrixScaleFmt::MATRIX_SCALE_FMT_E5M3: + O << "MATRIX_SCALE_FMT_E5M3"; + break; + case WMMA::MatrixScaleFmt::MATRIX_SCALE_FMT_E4M3: + O << "MATRIX_SCALE_FMT_E4M3"; + break; + } +} + +void AMDGPUInstPrinter::printMatrixAScaleFmt(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printMatrixScaleFmt(MI, OpNo, STI, O, 'a'); +} + +void AMDGPUInstPrinter::printMatrixBScaleFmt(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printMatrixScaleFmt(MI, OpNo, STI, O, 'b'); +} + void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index f6739b14..be32061c 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -140,6 +140,19 @@ private: const MCSubtargetInfo &STI, raw_ostream &O); void printMatrixBFMT(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printMatrixScale(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O, char AorB); + void printMatrixAScale(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printMatrixBScale(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printMatrixScaleFmt(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, 
raw_ostream &O, + char AorB); + void printMatrixAScaleFmt(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printMatrixBScaleFmt(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printInterpSlot(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printInterpAttr(const MCInst *MI, unsigned OpNo, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp index ffdac8b..fa0c95f 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -75,8 +75,9 @@ unsigned AMDGPUMCAsmInfo::getMaxInstLength(const MCSubtargetInfo *STI) const { if (STI->hasFeature(AMDGPU::FeatureNSAEncoding)) return 20; - // VOP3PX encoding. - if (STI->hasFeature(AMDGPU::FeatureGFX950Insts)) + // VOP3PX/VOP3PX2 encoding. + if (STI->hasFeature(AMDGPU::FeatureGFX950Insts) || + STI->hasFeature(AMDGPU::FeatureGFX1250Insts)) return 16; // 64-bit instruction with 32-bit literal. diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 43ca548..68302f0 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -872,14 +872,14 @@ void AMDGPUTargetELFStreamer::EmitAMDKernelCodeT(AMDGPUMCKernelCodeT &Header) { void AMDGPUTargetELFStreamer::EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) { - MCSymbolELF *Symbol = cast<MCSymbolELF>( + auto *Symbol = static_cast<MCSymbolELF *>( getStreamer().getContext().getOrCreateSymbol(SymbolName)); Symbol->setType(Type); } void AMDGPUTargetELFStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size, Align Alignment) { - MCSymbolELF *SymbolELF = cast<MCSymbolELF>(Symbol); + auto *SymbolELF = static_cast<MCSymbolELF *>(Symbol); SymbolELF->setType(ELF::STT_OBJECT); if (!SymbolELF->isBindingSet()) @@ -974,9 +974,9 @@ void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor( auto &Streamer = getStreamer(); auto &Context = Streamer.getContext(); - MCSymbolELF *KernelCodeSymbol = cast<MCSymbolELF>( - Context.getOrCreateSymbol(Twine(KernelName))); - MCSymbolELF *KernelDescriptorSymbol = cast<MCSymbolELF>( + auto *KernelCodeSymbol = + static_cast<MCSymbolELF *>(Context.getOrCreateSymbol(Twine(KernelName))); + auto *KernelDescriptorSymbol = static_cast<MCSymbolELF *>( Context.getOrCreateSymbol(Twine(KernelName) + Twine(".kd"))); // Copy kernel descriptor symbol's binding, other and visibility from the diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index c564145..deadb7a 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -1018,6 +1018,17 @@ enum MatrixFMT : unsigned { MATRIX_FMT_BF6 = 3, MATRIX_FMT_FP4 = 4 }; + +enum MatrixScale : unsigned { + MATRIX_SCALE_ROW0 = 0, + MATRIX_SCALE_ROW1 = 1, +}; + +enum MatrixScaleFmt : unsigned { + MATRIX_SCALE_FMT_E8 = 0, + MATRIX_SCALE_FMT_E5M3 = 1, + MATRIX_SCALE_FMT_E4M3 = 2 +}; } // namespace WMMA namespace VOP3PEncoding { diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index e934152..0c653b1 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1169,11 +1169,18 @@ void SIFoldOperandsImpl::foldOperand( // Grab the use operands first SmallVector<MachineOperand *, 4> 
UsesToProcess( llvm::make_pointer_range(MRI->use_nodbg_operands(RegSeqDstReg))); - for (auto *RSUse : UsesToProcess) { + for (unsigned I = 0; I != UsesToProcess.size(); ++I) { + MachineOperand *RSUse = UsesToProcess[I]; MachineInstr *RSUseMI = RSUse->getParent(); unsigned OpNo = RSUseMI->getOperandNo(RSUse); if (SplatRC) { + if (RSUseMI->isCopy()) { + Register DstReg = RSUseMI->getOperand(0).getReg(); + append_range(UsesToProcess, + make_pointer_range(MRI->use_nodbg_operands(DstReg))); + continue; + } if (tryFoldRegSeqSplat(RSUseMI, OpNo, SplatVal, SplatRC)) { FoldableDef SplatDef(SplatVal, SplatRC); appendFoldCandidate(FoldList, RSUseMI, OpNo, SplatDef); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 4d67e4a..63826b7 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2098,10 +2098,17 @@ bool SITargetLowering::isNonGlobalAddrSpace(unsigned AS) { bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const { - // Flat -> private/local is a simple truncate. - // Flat -> global is no-op - if (SrcAS == AMDGPUAS::FLAT_ADDRESS) + if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { + if (DestAS == AMDGPUAS::PRIVATE_ADDRESS && + Subtarget->hasGloballyAddressableScratch()) { + // Flat -> private requires subtracting src_flat_scratch_base_lo. + return false; + } + + // Flat -> private/local is a simple truncate. + // Flat -> global is no-op return true; + } const GCNTargetMachine &TM = static_cast<const GCNTargetMachine &>(getTargetMachine()); @@ -7650,6 +7657,9 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL, const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS) ? AMDGPU::SRC_SHARED_BASE : AMDGPU::SRC_PRIVATE_BASE; + assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE || + !Subtarget->hasGloballyAddressableScratch()) && + "Cannot use src_private_base with globally addressable scratch!"); // Note: this feature (register) is broken. When used as a 32-bit operand, // it returns a wrong value (all zeroes?). The real value is in the upper 32 // bits. @@ -7760,6 +7770,18 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op, DestAS == AMDGPUAS::PRIVATE_ADDRESS) { SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src); + if (DestAS == AMDGPUAS::PRIVATE_ADDRESS && + Subtarget->hasGloballyAddressableScratch()) { + // flat -> private with globally addressable scratch: subtract + // src_flat_scratch_base_lo. 
+ SDValue FlatScratchBaseLo( + DAG.getMachineNode( + AMDGPU::S_MOV_B32, SL, MVT::i32, + DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)), + 0); + Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo); + } + if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS)) return Ptr; @@ -7776,11 +7798,40 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op, if (DestAS == AMDGPUAS::FLAT_ADDRESS) { if (SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS) { - - SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG); - SDValue CvtPtr = - DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture); - CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr); + SDValue CvtPtr; + if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS && + Subtarget->hasGloballyAddressableScratch()) { + // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr + // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr + SDValue AllOnes = DAG.getSignedTargetConstant(-1, SL, MVT::i32); + SDValue ThreadID = DAG.getConstant(0, SL, MVT::i32); + ThreadID = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, + DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_lo, SL, MVT::i32), + AllOnes, ThreadID); + if (Subtarget->isWave64()) + ThreadID = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, + DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_hi, SL, MVT::i32), + AllOnes, ThreadID); + SDValue ShAmt = DAG.getShiftAmountConstant( + 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL); + SDValue SrcHi = DAG.getNode(ISD::SHL, SL, MVT::i32, ThreadID, ShAmt); + CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, SrcHi); + CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr); + // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full + // 64-bit hi:lo value. + SDValue FlatScratchBase = { + DAG.getMachineNode( + AMDGPU::S_MOV_B64, SL, MVT::i64, + DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)), + 0}; + CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase); + } else { + SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG); + CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture); + CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr); + } if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS)) return CvtPtr; @@ -9424,15 +9475,29 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::amdgcn_is_shared: case Intrinsic::amdgcn_is_private: { SDLoc SL(Op); - unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared) - ? AMDGPUAS::LOCAL_ADDRESS - : AMDGPUAS::PRIVATE_ADDRESS; - SDValue Aperture = getSegmentAperture(AS, SL, DAG); SDValue SrcVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1)); - SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec, DAG.getConstant(1, SL, MVT::i32)); + + unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared) + ? AMDGPUAS::LOCAL_ADDRESS + : AMDGPUAS::PRIVATE_ADDRESS; + if (AS == AMDGPUAS::PRIVATE_ADDRESS && + Subtarget->hasGloballyAddressableScratch()) { + SDValue FlatScratchBaseHi( + DAG.getMachineNode( + AMDGPU::S_MOV_B32, DL, MVT::i32, + DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)), + 0); + // Test bits 63..58 against the aperture address. 
+ return DAG.getSetCC( + SL, MVT::i1, + DAG.getNode(ISD::XOR, SL, MVT::i32, SrcHi, FlatScratchBaseHi), + DAG.getConstant(1u << 26, SL, MVT::i32), ISD::SETULT); + } + + SDValue Aperture = getSegmentAperture(AS, SL, DAG); return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ); } case Intrinsic::amdgcn_perm: diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index a3e20ba..c552f1a 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -908,6 +908,32 @@ def SupportedRoundMode : TImmLeaf<i32, [{ Imm == (int)RoundingMode::TowardNegative; }]>; +def VOP3PModsNeg : SDNodeXForm<timm, [{ + unsigned Mods = SISrcMods::OP_SEL_1; + if (N->getZExtValue()) + Mods ^= SISrcMods::NEG; + return CurDAG->getTargetConstant(Mods, SDLoc(N), MVT::i32); +}]>; + +def VOP3PModsNegs : SDNodeXForm<timm, [{ + unsigned Mods = SISrcMods::OP_SEL_1; + if (N->getZExtValue()) + Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); + return CurDAG->getTargetConstant(Mods, SDLoc(N), MVT::i32); +}]>; + +def VOP3PModsNegAbs : SDNodeXForm<timm, [{ + unsigned Val = N->getZExtValue(); + unsigned Mods = SISrcMods::OP_SEL_1; // default: none + if (Val == 1) // neg + Mods ^= SISrcMods::NEG; + if (Val == 2) // abs + Mods ^= SISrcMods::ABS; + if (Val == 3) // neg and abs + Mods ^= (SISrcMods::NEG | SISrcMods::ABS); + return CurDAG->getTargetConstant(Mods, SDLoc(N), MVT::i32); +}]>; + class bitextract_imm<int bitnum> : SDNodeXForm<imm, [{ uint64_t Imm = N->getZExtValue(); unsigned Bit = (Imm >> }] # bitnum # [{ ) & 1; @@ -1310,6 +1336,12 @@ def bitop3_0 : DefaultOperand<BitOp3, 0>; def MatrixAFMT : CustomOperand<i32, 1, "MatrixAFMT">; def MatrixBFMT : CustomOperand<i32, 1, "MatrixBFMT">; +def MatrixAScale : CustomOperand<i32, 1, "MatrixAScale">; +def MatrixBScale : CustomOperand<i32, 1, "MatrixBScale">; + +def MatrixAScaleFmt : CustomOperand<i32, 1, "MatrixAScaleFmt">; +def MatrixBScaleFmt : CustomOperand<i32, 1, "MatrixBScaleFmt">; + def MatrixAReuse : NamedBitOperand<"matrix_a_reuse">; def MatrixBReuse : NamedBitOperand<"matrix_b_reuse">; @@ -1647,9 +1679,6 @@ def VOP3OMods : ComplexPattern<untyped, 3, "SelectVOP3OMods">; def VOP3PMods : ComplexPattern<untyped, 2, "SelectVOP3PMods">; def VOP3PModsDOT : ComplexPattern<untyped, 2, "SelectVOP3PModsDOT">; -def VOP3PModsNeg : ComplexPattern<untyped, 1, "SelectVOP3PModsNeg">; -def VOP3PModsNegs : ComplexPattern<untyped, 1, "SelectVOP3PModsNegs">; // chfang: not use complex pattern? 
-def VOP3PModsNegAbs : ComplexPattern<untyped, 1, "SelectVOP3PModsNegAbs">; def WMMAOpSelVOP3PMods : ComplexPattern<untyped, 1, "SelectWMMAOpSelVOP3PMods">; def WMMAModsF32NegAbs : ComplexPattern<untyped, 2, "SelectWMMAModsF32NegAbs">; @@ -1774,6 +1803,7 @@ class getVALUDstForVT<ValueType VT, bit IsTrue16 = 0, bit IsVOP3Encoding = 0> { !eq(VT.Size, 256) : VOPDstOperand<VReg_256>, !eq(VT.Size, 192) : VOPDstOperand<VReg_192>, !eq(VT.Size, 128) : VOPDstOperand<VReg_128>, + !eq(VT.Size, 96) : VOPDstOperand<VReg_96>, !eq(VT.Size, 64) : VOPDstOperand<VReg_64>, !eq(VT.Size, 32) : VOPDstOperand<VGPR_32>, !eq(VT.Size, 16) : op16, @@ -1924,6 +1954,7 @@ class getVOP3DPPSrcForVT<ValueType VT, bit IsFake16 = 1> { !eq(VT, v2f16) : VCSrc_v2f16, !eq(VT, v2bf16) : VCSrc_v2bf16, !eq(VT, f32) : VCSrc_f32, + !eq(VT, v2i32) : VCSrc_v2b32, 1 : VCSrc_b32); } @@ -2678,6 +2709,8 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> { field bit HasNeg = HasModifiers; field bit HasMatrixReuse = 0; field bit HasMatrixFMT = 0; + field bit HasMatrixScale = 0; + field bit HasMatrixReuse = 0; field bit HasSrc0Mods = HasModifiers; field bit HasSrc1Mods = !if(HasModifiers, !or(HasSrc1FloatMods, HasSrc1IntMods), 0); @@ -2935,6 +2968,9 @@ def VOP_V2BF16_F32_F32_I32 : VOPProfile <[v2bf16, f32, f32, i32]>; def VOP_V2F16_F32_F32_I32 : VOPProfile <[v2f16, f32, f32, i32]>; def VOP_V6I32_V32F16_F32 : VOPProfile<[v6i32, v32f16, f32, untyped]>; def VOP_V6I32_V32BF16_F32 : VOPProfile<[v6i32, v32bf16, f32, untyped]>; +def VOP_V3I32_V16F16_F32 : VOPProfile<[v3i32, v16f16, f32, untyped]>; +def VOP_V3I32_V16BF16_F32 : VOPProfile<[v3i32, v16bf16, f32, untyped]>; +def VOP_V3I32_V16F32_F32 : VOPProfile<[v3i32, v16f32, f32, untyped]>; def VOP_V6I32_V16F32_V16F32_F32 : VOPProfile<[v6i32, v16f32, v16f32, f32]>; def VOP_V2F16_I32_F32 : VOPProfile<[v2f16, i32, f32, untyped]>; def VOP_V2I16_F32_F32_F32 : VOPProfile<[v2i16, f32, f32, f32]>; @@ -2948,6 +2984,8 @@ def VOP_BF16_F32_I32 : VOPProfile<[bf16, f32, i32, untyped]>; def VOP_F16_F32_I32 : VOPProfile<[f16, f32, i32, untyped]>; def VOP_I32_BF16_I32_F32 : VOPProfile<[i32, bf16, i32, f32]>; def VOP_I32_F16_I32_F32 : VOPProfile<[i32, f16, i32, f32]>; +def VOP_V16F16_V3I32_I32 : VOPProfile<[v16f16, v3i32, i32, untyped]>; +def VOP_V16BF16_V3I32_I32 : VOPProfile<[v16bf16, v3i32, i32, untyped]>; def VOP_V8F16_V2I32_I32 : VOPProfile<[v8f16, v2i32, i32, untyped]>; def VOP_V8BF16_V2I32_I32 : VOPProfile<[v8bf16, v2i32, i32, untyped]>; def VOP_V8F16_I32_I32 : VOPProfile<[v8f16, i32, i32, untyped]>; @@ -2955,11 +2993,26 @@ def VOP_V8BF16_I32_I32 : VOPProfile<[v8bf16, i32, i32, untyped]>; def VOP_V16F32_V3I32_I32 : VOPProfile<[v16f32, v3i32, i32, untyped]>; def VOP_V8F32_V2I32_I32 : VOPProfile<[v8f32, v2i32, i32, untyped]>; def VOP_V8F32_I32_I32 : VOPProfile<[v8f32, i32, i32, untyped]>; +def VOP_V2I32_V8BF16_F32 : VOPProfile<[v2i32, v8bf16, f32, untyped]>; +def VOP_V2I32_V8F16_F32 : VOPProfile<[v2i32, v8f16, f32, untyped]>; +def VOP_V2I32_V8F32_F32 : VOPProfile<[v2i32, v8f32, f32, untyped]>; +def VOP_I32_V8F32_F32 : VOPProfile<[i32, v8f32, f32, untyped]>; +def VOP_I32_V8F16_F32 : VOPProfile<[i32, v8f16, f32, untyped]>; +def VOP_I32_V8BF16_F32 : VOPProfile<[i32, v8bf16, f32, untyped]>; def VOP_I32_F32_I32_F32 : VOPProfile<[i32, f32, i32, f32]>; def VOP_V6I32_V32BF16_I32_F32 : VOPProfile<[v6i32, v32bf16, i32, f32]>; def VOP_V6I32_V32F16_I32_F32 : VOPProfile<[v6i32, v32f16, i32, f32]>; def VOP_V6I32_V32F32_I32_F32 : VOPProfile<[v6i32, v32f32, i32, f32]>; +def VOP_V3I32_V16F16_I32_F32 : 
VOPProfile<[v3i32, v16f16, i32, f32]>; +def VOP_V3I32_V16BF16_I32_F32 : VOPProfile<[v3i32, v16bf16, i32, f32]>; +def VOP_V3I32_V16F32_I32_F32 : VOPProfile<[v3i32, v16f32, i32, f32]>; +def VOP_V2I32_V8BF16_I32_F32 : VOPProfile<[v2i32, v8bf16, i32, f32]>; +def VOP_V2I32_V8F16_I32_F32 : VOPProfile<[v2i32, v8f16, i32, f32]>; +def VOP_V2I32_V8F32_I32_F32 : VOPProfile<[v2i32, v8f32, i32, f32]>; +def VOP_I32_V8F32_I32_F32 : VOPProfile<[i32, v8f32, i32, f32]>; +def VOP_I32_V8F16_I32_F32 : VOPProfile<[i32, v8f16, i32, f32]>; +def VOP_I32_V8BF16_I32_F32 : VOPProfile<[i32, v8bf16, i32, f32]>; def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>; def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 54fa192..bd5dfa9 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -3543,14 +3543,21 @@ def : GCNPat < (vecTy (UniformBinFrag<build_vector> (Ty undef), (Ty SReg_32:$src1))), (S_LSHL_B32 SReg_32:$src1, (i32 16)) >; -} def : GCNPat < (vecTy (DivergentBinFrag<build_vector> (Ty undef), (Ty VGPR_32:$src1))), (vecTy (V_LSHLREV_B32_e64 (i32 16), VGPR_32:$src1)) >; +} // End True16Predicate = ... } // End foreach Ty = ... -} +} // End AddedComplexity = 1 + +let True16Predicate = UseRealTrue16Insts in +def : GCNPat < + (v2i16 (DivergentBinFrag<build_vector> (i16 undef), (i16 (trunc i32:$src1)))), + (REG_SEQUENCE VGPR_32, (i16 (IMPLICIT_DEF)), lo16, + (i16 (EXTRACT_SUBREG VGPR_32:$src1, lo16)), hi16) +>; let SubtargetPredicate = HasVOP3PInsts in { foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in @@ -3599,7 +3606,11 @@ def : GCNPat < >; def : GCNPat < (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_16:$src0), (Ty undef))), - (REG_SEQUENCE VGPR_32, $src0, lo16, (IMPLICIT_DEF), hi16) + (REG_SEQUENCE VGPR_32, $src0, lo16, (Ty (IMPLICIT_DEF)), hi16) +>; +def : GCNPat < + (vecTy (DivergentBinFrag<build_vector> (Ty undef), (Ty VGPR_16:$src1))), + (REG_SEQUENCE VGPR_32, (Ty (IMPLICIT_DEF)), lo16, (Ty VGPR_16:$src1), hi16) >; } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index f3acc5c..ae0f304 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -598,6 +598,8 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT); reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE); reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT); + reserveRegisterTuples(Reserved, AMDGPU::SRC_FLAT_SCRATCH_BASE_LO); + reserveRegisterTuples(Reserved, AMDGPU::SRC_FLAT_SCRATCH_BASE_HI); // Reserve async counters pseudo registers reserveRegisterTuples(Reserved, AMDGPU::ASYNCcnt); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 36d1a3b..81655f5 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -246,6 +246,22 @@ defm SRC_SHARED_LIMIT : ApertureRegister<"src_shared_limit", 236>; defm SRC_PRIVATE_BASE : ApertureRegister<"src_private_base", 237>; defm SRC_PRIVATE_LIMIT : ApertureRegister<"src_private_limit", 238>; +let isConstant = true in { + defm SRC_FLAT_SCRATCH_BASE_LO : SIRegLoHi16<"src_flat_scratch_base_lo", 230>; + defm SRC_FLAT_SCRATCH_BASE_HI : SIRegLoHi16<"src_flat_scratch_base_hi", 231>; + + // Using src_flat_scratch_base_lo in a 64-bit context gets the full 64-bit + // 
hi:lo value. + def SRC_FLAT_SCRATCH_BASE : + RegisterWithSubRegs<"src_flat_scratch_base_lo", + [SRC_FLAT_SCRATCH_BASE_LO, + SRC_FLAT_SCRATCH_BASE_HI]> { + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1]; + let HWEncoding = SRC_FLAT_SCRATCH_BASE_LO.HWEncoding; + } +} + defm SRC_POPS_EXITING_WAVE_ID : SIRegLoHi16<"src_pops_exiting_wave_id", 239>; // Not addressable @@ -765,7 +781,7 @@ def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i SGPR_NULL, SGPR_NULL_HI, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE_LO, SRC_SHARED_LIMIT_LO, SRC_PRIVATE_BASE_LO, SRC_PRIVATE_LIMIT_LO, SRC_SHARED_BASE_HI, SRC_SHARED_LIMIT_HI, SRC_PRIVATE_BASE_HI, SRC_PRIVATE_LIMIT_HI, SRC_POPS_EXITING_WAVE_ID, - SRC_VCCZ, SRC_EXECZ, SRC_SCC)> { + SRC_VCCZ, SRC_EXECZ, SRC_SCC, SRC_FLAT_SCRATCH_BASE_LO, SRC_FLAT_SCRATCH_BASE_HI)> { let AllocationPriority = 0; } @@ -776,7 +792,8 @@ def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16, SRC_SHARED_LIMIT_LO_LO16, SRC_PRIVATE_BASE_LO_LO16, SRC_PRIVATE_LIMIT_LO_LO16, SRC_SHARED_BASE_HI_LO16, SRC_SHARED_LIMIT_HI_LO16, SRC_PRIVATE_BASE_HI_LO16, SRC_PRIVATE_LIMIT_HI_LO16, SRC_POPS_EXITING_WAVE_ID_LO16, SRC_VCCZ_LO16, - SRC_EXECZ_LO16, SRC_SCC_LO16, EXEC_LO_LO16, EXEC_HI_LO16, M0_CLASS_LO16)> { + SRC_EXECZ_LO16, SRC_SCC_LO16, EXEC_LO_LO16, EXEC_HI_LO16, M0_CLASS_LO16, + SRC_FLAT_SCRATCH_BASE_LO_LO16, SRC_FLAT_SCRATCH_BASE_HI_LO16)> { let Size = 16; let isAllocatable = 0; let BaseClassOrder = 16; @@ -849,7 +866,8 @@ def TTMP_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16, v4bf16], def SReg_64_XEXEC_XNULL : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16, v4bf16], 32, (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, SRC_SHARED_BASE, - SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, TTMP_64, TBA, TMA)> { + SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, TTMP_64, TBA, TMA, + SRC_FLAT_SCRATCH_BASE)> { let CopyCost = 1; let AllocationPriority = 1; let HasSGPR = 1; @@ -1302,6 +1320,7 @@ def VCSrc_f64 : SrcRegOrImm9 <VS_64, "OPERAND_REG_INLINE_C_FP64">; def VCSrc_v2b16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2INT16">; def VCSrc_v2bf16: SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2BF16">; def VCSrc_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2FP16">; +def VCSrc_v2b32 : SrcRegOrImm9 <VS_64, "OPERAND_REG_INLINE_C_V2INT32">; // True 16 Operands def VCSrcT_b16 : SrcRegOrImm9_t16 <"OPERAND_REG_INLINE_C_INT16">; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 65fa088..00dcb9b 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -2654,6 +2654,8 @@ bool isInlineValue(unsigned Reg) { case AMDGPU::SRC_PRIVATE_BASE: case AMDGPU::SRC_PRIVATE_LIMIT_LO: case AMDGPU::SRC_PRIVATE_LIMIT: + case AMDGPU::SRC_FLAT_SCRATCH_BASE_LO: + case AMDGPU::SRC_FLAT_SCRATCH_BASE_HI: case AMDGPU::SRC_POPS_EXITING_WAVE_ID: return true; case AMDGPU::SRC_VCCZ: diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index f621f85..b128207 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -107,18 +107,6 @@ class VOP1_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : VOP_DPP_Pseudo <OpName, P, pattern> { } -class getVOP1Pat <SDPatternOperator node, VOPProfile P> : LetDummies { - list<dag> ret = - !if(P.HasModifiers, - [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods 
P.Src0VT:$src0, i32:$src0_modifiers))))], - !if(P.HasOMod, - [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3OMods P.Src0VT:$src0, - i1:$clamp, i32:$omod))))], - [(set P.DstVT:$vdst, (node (P.Src0VT P.Src0RC32:$src0)))] - ) - ); -} - multiclass VOP1Inst <string opName, VOPProfile P, SDPatternOperator node = null_frag, int VOPDOp = -1> { // We only want to set this on the basic, non-SDWA or DPP forms. diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 19ce7f5..f4b6af6 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -1726,6 +1726,12 @@ multiclass VOP3CvtScaleSelInst<string OpName, VOPProfile P, SDPatternOperator no } } +let HasExtVOP3DPP = 0, HasModifiers = 0 in { +def VOP3_V2I32_I32_I32_V2I32 : VOP3_Profile<VOPProfile<[v2i32, i32, i32, v2i32]>>; +def VOP3_V3I32_I32_I64_V2I32 : VOP3_Profile<VOPProfile<[v3i32, i32, i64, v2i32]>>; +def VOP3_V4I32_I64_I64_V2I32 : VOP3_Profile<VOPProfile<[v4i32, i64, i64, v2i32]>>; +} + let Src0RC64 = VSrc_NoInline_v2f16 in { def VOP3_CVT_PK_F8_F16_Profile : VOP3_Profile<VOP_I16_V2F16>; def VOP3_CVT_PK_F8_F16_True16_Profile : VOP3_Profile_True16<VOP3_CVT_PK_F8_F16_Profile>; @@ -1771,6 +1777,12 @@ let SubtargetPredicate = isGFX1250Plus in { defm V_CVT_SCALE_PK8_BF16_BF8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_bf16_bf8", VOP_V8BF16_V2I32_I32, int_amdgcn_cvt_scale_pk8_bf16_bf8>; defm V_CVT_SCALE_PK8_F32_FP8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f32_fp8", VOP_V8F32_V2I32_I32, int_amdgcn_cvt_scale_pk8_f32_fp8>; defm V_CVT_SCALE_PK8_F32_BF8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f32_bf8", VOP_V8F32_V2I32_I32, int_amdgcn_cvt_scale_pk8_f32_bf8>; + defm V_CVT_SCALE_PK16_F16_FP6 : VOP3CvtScaleSelInst<"v_cvt_scale_pk16_f16_fp6", VOP_V16F16_V3I32_I32, int_amdgcn_cvt_scale_pk16_f16_fp6>; + defm V_CVT_SCALE_PK16_BF16_FP6 : VOP3CvtScaleSelInst<"v_cvt_scale_pk16_bf16_fp6", VOP_V16BF16_V3I32_I32, int_amdgcn_cvt_scale_pk16_bf16_fp6>; + defm V_CVT_SCALE_PK16_F16_BF6 : VOP3CvtScaleSelInst<"v_cvt_scale_pk16_f16_bf6", VOP_V16F16_V3I32_I32, int_amdgcn_cvt_scale_pk16_f16_bf6>; + defm V_CVT_SCALE_PK16_BF16_BF6 : VOP3CvtScaleSelInst<"v_cvt_scale_pk16_bf16_bf6", VOP_V16BF16_V3I32_I32, int_amdgcn_cvt_scale_pk16_bf16_bf6>; + defm V_CVT_SCALE_PK16_F32_FP6 : VOP3CvtScaleSelInst<"v_cvt_scale_pk16_f32_fp6", VOP_V16F32_V3I32_I32, int_amdgcn_cvt_scale_pk16_f32_fp6>; + defm V_CVT_SCALE_PK16_F32_BF6 : VOP3CvtScaleSelInst<"v_cvt_scale_pk16_f32_bf6", VOP_V16F32_V3I32_I32, int_amdgcn_cvt_scale_pk16_f32_bf6>; } // End Constraints = "@earlyclobber $vdst" defm V_CVT_SCALE_PK8_F16_FP4 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f16_fp4", VOP_V8F16_I32_I32, int_amdgcn_cvt_scale_pk8_f16_fp4>; @@ -1778,6 +1790,44 @@ let SubtargetPredicate = isGFX1250Plus in { defm V_CVT_SCALE_PK8_F32_FP4 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f32_fp4", VOP_V8F32_I32_I32, int_amdgcn_cvt_scale_pk8_f32_fp4>; } // End ReadsModeReg = 0 + let Constraints = "@earlyclobber $vdst" in { + let WaveSizePredicate = isWave32 in { + defm V_CVT_SCALEF32_PK8_FP8_BF16 : VOP3Inst<"v_cvt_scalef32_pk8_fp8_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8BF16_F32>, int_amdgcn_cvt_scalef32_pk8_fp8_bf16>; + defm V_CVT_SCALEF32_PK8_BF8_BF16 : VOP3Inst<"v_cvt_scalef32_pk8_bf8_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8BF16_F32>, int_amdgcn_cvt_scalef32_pk8_bf8_bf16>; + defm V_CVT_SCALEF32_PK8_FP8_F16 : VOP3Inst<"v_cvt_scalef32_pk8_fp8_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F16_F32>, int_amdgcn_cvt_scalef32_pk8_fp8_f16>; 
+ defm V_CVT_SCALEF32_PK8_BF8_F16 : VOP3Inst<"v_cvt_scalef32_pk8_bf8_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F16_F32>, int_amdgcn_cvt_scalef32_pk8_bf8_f16>; + defm V_CVT_SCALEF32_PK8_FP8_F32 : VOP3Inst<"v_cvt_scalef32_pk8_fp8_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F32_F32>, int_amdgcn_cvt_scalef32_pk8_fp8_f32>; + defm V_CVT_SCALEF32_PK8_BF8_F32 : VOP3Inst<"v_cvt_scalef32_pk8_bf8_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F32_F32>, int_amdgcn_cvt_scalef32_pk8_bf8_f32>; + defm V_CVT_SCALEF32_PK8_FP4_F32 : VOP3Inst<"v_cvt_scalef32_pk8_fp4_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8F32_F32>, int_amdgcn_cvt_scalef32_pk8_fp4_f32>; + defm V_CVT_SCALEF32_PK8_FP4_F16 : VOP3Inst<"v_cvt_scalef32_pk8_fp4_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8F16_F32>, int_amdgcn_cvt_scalef32_pk8_fp4_f16>; + defm V_CVT_SCALEF32_PK8_FP4_BF16 : VOP3Inst<"v_cvt_scalef32_pk8_fp4_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8BF16_F32>, int_amdgcn_cvt_scalef32_pk8_fp4_bf16>; + } // End WaveSizePredicate = isWave32 + defm V_CVT_SCALEF32_PK16_FP6_F32 : VOP3Inst<"v_cvt_scalef32_pk16_fp6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F32_F32>, int_amdgcn_cvt_scalef32_pk16_fp6_f32>; + defm V_CVT_SCALEF32_PK16_BF6_F32 : VOP3Inst<"v_cvt_scalef32_pk16_bf6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F32_F32>, int_amdgcn_cvt_scalef32_pk16_bf6_f32>; + defm V_CVT_SCALEF32_PK16_FP6_F16 : VOP3Inst<"v_cvt_scalef32_pk16_fp6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F16_F32>, int_amdgcn_cvt_scalef32_pk16_fp6_f16>; + defm V_CVT_SCALEF32_PK16_BF6_F16 : VOP3Inst<"v_cvt_scalef32_pk16_bf6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F16_F32>, int_amdgcn_cvt_scalef32_pk16_bf6_f16>; + defm V_CVT_SCALEF32_PK16_FP6_BF16 : VOP3Inst<"v_cvt_scalef32_pk16_fp6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16BF16_F32>, int_amdgcn_cvt_scalef32_pk16_fp6_bf16>; + defm V_CVT_SCALEF32_PK16_BF6_BF16 : VOP3Inst<"v_cvt_scalef32_pk16_bf6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16BF16_F32>, int_amdgcn_cvt_scalef32_pk16_bf6_bf16>; + + let WaveSizePredicate = isWave32 in { + defm V_CVT_SCALEF32_SR_PK8_FP8_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp8_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp8_bf16>; + defm V_CVT_SCALEF32_SR_PK8_BF8_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_bf8_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_bf8_bf16>; + defm V_CVT_SCALEF32_SR_PK8_FP8_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp8_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp8_f16>; + defm V_CVT_SCALEF32_SR_PK8_BF8_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_bf8_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_bf8_f16>; + defm V_CVT_SCALEF32_SR_PK8_FP8_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp8_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp8_f32>; + defm V_CVT_SCALEF32_SR_PK8_BF8_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk8_bf8_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_bf8_f32>; + defm V_CVT_SCALEF32_SR_PK8_FP4_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp4_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp4_f32>; + defm V_CVT_SCALEF32_SR_PK8_FP4_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp4_f16", 
VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp4_f16>; + defm V_CVT_SCALEF32_SR_PK8_FP4_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp4_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16>; + } // End WaveSizePredicate = isWave32 + defm V_CVT_SCALEF32_SR_PK16_BF6_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk16_bf6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16>; + defm V_CVT_SCALEF32_SR_PK16_BF6_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk16_bf6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_bf6_f16>; + defm V_CVT_SCALEF32_SR_PK16_BF6_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk16_bf6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_bf6_f32>; + defm V_CVT_SCALEF32_SR_PK16_FP6_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk16_fp6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16>; + defm V_CVT_SCALEF32_SR_PK16_FP6_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk16_fp6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_fp6_f16>; + defm V_CVT_SCALEF32_SR_PK16_FP6_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk16_fp6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_fp6_f32>; + } // End Constraints = "@earlyclobber $vdst" + let True16Predicate = UseRealTrue16Insts in { def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_fp8_f16, V_CVT_SR_FP8_F16_t16_e64, f16>; def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_bf8_f16, V_CVT_SR_BF8_F16_t16_e64, f16>; @@ -1788,6 +1838,12 @@ let SubtargetPredicate = isGFX1250Plus in { } } // End SubtargetPredicate = isGFX1250Plus +let SubtargetPredicate = HasTensorCvtLutInsts in { + defm V_PERM_PK16_B4_U4 : VOP3Inst<"v_perm_pk16_b4_u4", VOP3_V2I32_I32_I32_V2I32, int_amdgcn_perm_pk16_b4_u4>; + defm V_PERM_PK16_B6_U4 : VOP3Inst<"v_perm_pk16_b6_u4", VOP3_V3I32_I32_I64_V2I32, int_amdgcn_perm_pk16_b6_u4>; + defm V_PERM_PK16_B8_U4 : VOP3Inst<"v_perm_pk16_b8_u4", VOP3_V4I32_I64_I64_V2I32, int_amdgcn_perm_pk16_b8_u4>; +} // End SubtargetPredicate = HasTensorCvtLutInsts + class Cvt_Scale_Sr_F32ToBF16F16_Pat<SDPatternOperator node, VOP3_Pseudo inst, ValueType DstTy> : GCNPat< (DstTy (node DstTy:$vdst_in, f32:$src0, i32:$src1, timm:$word_sel)), (inst (DstSelToOpSelXForm $word_sel), $src0, 0, $src1, VGPR_32:$vdst_in) @@ -2186,6 +2242,9 @@ let AssemblerPredicate = isGFX11Plus in { } // These instructions differ from GFX12 variant by supporting DPP: +defm V_PERM_PK16_B4_U4 : VOP3Only_Real_Base_gfx1250<0x23f>; +defm V_PERM_PK16_B6_U4 : VOP3Only_Real_Base_gfx1250<0x242>; +defm V_PERM_PK16_B8_U4 : VOP3Only_Real_Base_gfx1250<0x243>; defm V_LSHL_ADD_U64 : VOP3Only_Realtriple_gfx1250<0x252>; defm V_ASHR_PK_I8_I32 : VOP3Only_Realtriple_gfx1250<0x290>; defm V_ASHR_PK_U8_I32 : VOP3Only_Realtriple_gfx1250<0x291>; @@ -2198,6 +2257,42 @@ defm V_CVT_SCALE_PK8_F32_FP8 : VOP3Only_ScaleSel_Real_gfx1250<0x2aa>; defm V_CVT_SCALE_PK8_F16_BF8 : VOP3Only_ScaleSel_Real_gfx1250<0x2ab>; defm V_CVT_SCALE_PK8_BF16_BF8 : VOP3Only_ScaleSel_Real_gfx1250<0x2ac>; defm V_CVT_SCALE_PK8_F32_BF8 : VOP3Only_ScaleSel_Real_gfx1250<0x2ad>; +defm V_CVT_SCALEF32_PK8_FP4_F32 : VOP3Only_Real_Base_gfx1250<0x2b0>; +defm V_CVT_SCALEF32_PK8_FP4_F16 : VOP3Only_Real_Base_gfx1250<0x2b3>; +defm V_CVT_SCALEF32_PK8_FP8_BF16 : VOP3Only_Real_Base_gfx1250<0x2b4>; +defm 
V_CVT_SCALEF32_PK8_BF8_BF16 : VOP3Only_Real_Base_gfx1250<0x2b5>; +defm V_CVT_SCALEF32_PK8_FP4_BF16 : VOP3Only_Real_Base_gfx1250<0x2b8>; +defm V_CVT_SCALEF32_PK8_FP8_F32 : VOP3Only_Real_Base_gfx1250<0x2c3>; +defm V_CVT_SCALEF32_PK8_FP8_F16 : VOP3Only_Real_Base_gfx1250<0x2c4>; +defm V_CVT_SCALEF32_PK8_BF8_F32 : VOP3Only_Real_Base_gfx1250<0x2c5>; +defm V_CVT_SCALEF32_PK8_BF8_F16 : VOP3Only_Real_Base_gfx1250<0x2c6>; +defm V_CVT_SCALE_PK16_F16_FP6 : VOP3Only_ScaleSel_Real_gfx1250<0x2c7>; +defm V_CVT_SCALE_PK16_BF16_FP6 : VOP3Only_ScaleSel_Real_gfx1250<0x2c8>; +defm V_CVT_SCALE_PK16_F32_FP6 : VOP3Only_ScaleSel_Real_gfx1250<0x2c9>; +defm V_CVT_SCALE_PK16_F16_BF6 : VOP3Only_ScaleSel_Real_gfx1250<0x2ca>; +defm V_CVT_SCALE_PK16_BF16_BF6 : VOP3Only_ScaleSel_Real_gfx1250<0x2cb>; +defm V_CVT_SCALE_PK16_F32_BF6 : VOP3Only_ScaleSel_Real_gfx1250<0x2cc>; +defm V_CVT_SCALEF32_PK16_FP6_F32 : VOP3Only_Real_Base_gfx1250<0x2cd>; +defm V_CVT_SCALEF32_PK16_BF6_F32 : VOP3Only_Real_Base_gfx1250<0x2ce>; +defm V_CVT_SCALEF32_PK16_FP6_F16 : VOP3Only_Real_Base_gfx1250<0x2cf>; +defm V_CVT_SCALEF32_PK16_BF6_F16 : VOP3Only_Real_Base_gfx1250<0x2d0>; +defm V_CVT_SCALEF32_PK16_FP6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d1>; +defm V_CVT_SCALEF32_PK16_BF6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d2>; +defm V_CVT_SCALEF32_SR_PK16_FP6_F32 : VOP3Only_Real_Base_gfx1250<0x2d3>; +defm V_CVT_SCALEF32_SR_PK16_BF6_F32 : VOP3Only_Real_Base_gfx1250<0x2d4>; +defm V_CVT_SCALEF32_SR_PK16_FP6_F16 : VOP3Only_Real_Base_gfx1250<0x2d5>; +defm V_CVT_SCALEF32_SR_PK16_BF6_F16 : VOP3Only_Real_Base_gfx1250<0x2d6>; +defm V_CVT_SCALEF32_SR_PK16_FP6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d7>; +defm V_CVT_SCALEF32_SR_PK16_BF6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d8>; +defm V_CVT_SCALEF32_SR_PK8_FP4_F32 : VOP3Only_Real_Base_gfx1250<0x297>; +defm V_CVT_SCALEF32_SR_PK8_FP8_F32 : VOP3Only_Real_Base_gfx1250<0x298>; +defm V_CVT_SCALEF32_SR_PK8_BF8_F32 : VOP3Only_Real_Base_gfx1250<0x299>; +defm V_CVT_SCALEF32_SR_PK8_FP4_F16 : VOP3Only_Real_Base_gfx1250<0x2b9>; +defm V_CVT_SCALEF32_SR_PK8_FP4_BF16 : VOP3Only_Real_Base_gfx1250<0x2bc>; +defm V_CVT_SCALEF32_SR_PK8_FP8_F16 : VOP3Only_Real_Base_gfx1250<0x2bf>; +defm V_CVT_SCALEF32_SR_PK8_FP8_BF16 : VOP3Only_Real_Base_gfx1250<0x2c0>; +defm V_CVT_SCALEF32_SR_PK8_BF8_F16 : VOP3Only_Real_Base_gfx1250<0x2c1>; +defm V_CVT_SCALEF32_SR_PK8_BF8_BF16 : VOP3Only_Real_Base_gfx1250<0x2c2>; defm V_CVT_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250<0x36d>; defm V_CVT_SR_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250<0x36e>; defm V_CVT_PK_F16_F32 : VOP3Only_Realtriple_gfx1250<0x36f>; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 95fcd4a..ce280d4 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -557,11 +557,11 @@ multiclass VOP3PDOTIUInst <string OpName, SDPatternOperator intrinsic_node> { null_frag, 1>; // Dot-iu instructions consider input as signed if imod neg bits are set. Thus // Dot-iu Intrinsics have extra operands and require separate codegen pattern. 
- def : GCNPat < (intrinsic_node (VOP3PModsNeg i32:$src0_mods), i32:$src0, - (VOP3PModsNeg i32:$src1_mods), i32:$src1, + def : GCNPat < (intrinsic_node timm:$src0_mods, i32:$src0, + timm:$src1_mods, i32:$src1, i32:$src2, (i1 timm:$clamp)), - (!cast<Instruction>(NAME) $src0_mods, i32:$src0, - $src1_mods, i32:$src1, + (!cast<Instruction>(NAME) (VOP3PModsNeg $src0_mods), i32:$src0, + (VOP3PModsNeg $src1_mods), i32:$src1, (i32 8), i32:$src2, i1:$clamp) >; } @@ -1302,11 +1302,11 @@ class WMMAOpSelPat<Instruction Inst, SDPatternOperator node, VOPProfile P> : class WMMAUIClampPat<Instruction Inst, SDPatternOperator node, VOPProfile P> : GCNPat < (P.DstVT (node - (VOP3PModsNeg i32:$src0_modifiers), (P.Src0VT P.Src0VT:$src0), - (VOP3PModsNeg i32:$src1_modifiers), (P.Src1VT P.Src1VT:$src1), + timm:$src0_modifiers, (P.Src0VT P.Src0VT:$src0), + timm:$src1_modifiers, (P.Src1VT P.Src1VT:$src1), (P.Src2VT P.Src2VT:$src2), (i1 timm:$clamp) )), - (P.DstVT (Inst i32:$src0_modifiers, P.Src0VT:$src0, i32:$src1_modifiers, P.Src1VT:$src1, (i32 8), P.Src2VT:$src2, i1:$clamp)) + (P.DstVT (Inst (VOP3PModsNeg $src0_modifiers), P.Src0VT:$src0, (VOP3PModsNeg $src1_modifiers), P.Src1VT:$src1, (i32 8), P.Src2VT:$src2, i1:$clamp)) >; class WMMAOpcodeMapping<Instruction TwoAddr, Instruction ThreeAddr> { @@ -1407,9 +1407,9 @@ let WaveSizePredicate = isWave64 in { } class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, - bit _IsIU, bit _IsFP8BF8XF32, bit _Has_ImodOp = 0, - bit _HasMatrixFMT = 0, bit _HasMatrixReuse = 0, - bit _IsF4 = 0> + bit _IsIU, bit _IsFP8BF8XF32, bit _Has_ImodOp = 0, + bit _HasMatrixFMT = 0, bit _HasMatrixScale = 0, + bit _Scale16 = 0, bit _HasMatrixReuse = 0, bit _IsF4 = 0> : VOP3P_Profile<VOPProfile<ArgTy>> { bit IsIU = _IsIU; bit NoABMods = !or(_IsFP8BF8XF32, _IsF4); // No IMOD support for A and B @@ -1417,6 +1417,8 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, int IndexType = _IndexType; let HasMatrixFMT = _HasMatrixFMT; + let HasMatrixScale = _HasMatrixScale; + bit Scale16 = _Scale16; let HasMatrixReuse = _HasMatrixReuse; bit HasIModOp = _Has_ImodOp; @@ -1455,6 +1457,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, IsC_F16: "_f16", IsC_BF16: "_bf16", 1: "_b32"))); + ValueType ScaleTy = !if(Scale16, i64, i32); // For f16 and bf16 matrices A and B, each element can be modified by // fneg(neg_lo,neg_hi = 1). 
For f32 and f64, neg_lo[0:1] is allowed, but @@ -1516,6 +1519,13 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, !eq(IndexType, 32): (ins IndexKey32bit:$index_key_32bit)); dag MatrixFMT = !if(HasMatrixFMT, (ins MatrixAFMT:$matrix_a_fmt, MatrixBFMT:$matrix_b_fmt), (ins)); + dag MatrixScaleSrc = !if(HasMatrixScale, + !if(Scale16, (ins VCSrc_b64:$scale_src0, VCSrc_b64:$scale_src1), + (ins VCSrc_b32:$scale_src0, VCSrc_b32:$scale_src1)), + (ins)); + dag MatrixScale = !if(HasMatrixScale, (ins MatrixAScale:$matrix_a_scale, MatrixBScale:$matrix_b_scale, + MatrixAScaleFmt:$matrix_a_scale_fmt, MatrixBScaleFmt:$matrix_b_scale_fmt), + (ins)); dag MatrixReuse = !if(HasMatrixReuse, (ins MatrixAReuse:$matrix_a_reuse, MatrixBReuse:$matrix_b_reuse), (ins)); dag Clamp = !if(HasClamp, (ins Clamp0:$clamp), (ins)); dag Neg = !cond(!and(NegLoAny, NegHiAny) : (ins neg_lo0:$neg_lo, neg_hi0:$neg_hi), @@ -1529,7 +1539,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, (ins VRegSrc_64:$src2), (ins VRegSrc_32:$src2)), IndexKey)), - MatrixFMT, MatrixReuse, Clamp, Neg); + MatrixScaleSrc, MatrixFMT, MatrixScale, MatrixReuse, Clamp, Neg); // asm @@ -1538,57 +1548,59 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, !eq(IndexType, 16) : "$index_key_16bit", !eq(IndexType, 32) : "$index_key_32bit"); string MatrxFMTAsm = !if(HasMatrixFMT, "$matrix_a_fmt$matrix_b_fmt", ""); + string MatrixScaleSrcAsm = !if(HasMatrixScale, ", $scale_src0, $scale_src1", ""); + string MatrixScaleAsm = !if(HasMatrixScale, "$matrix_a_scale$matrix_b_scale$matrix_a_scale_fmt$matrix_b_scale_fmt", ""); string MatrixReuseAsm = !if(HasMatrixReuse, "$matrix_a_reuse$matrix_b_reuse", ""); string ClampAsm = !if(HasClamp, "$clamp", ""); string NegAsm = !cond(!and(NegLoAny, NegHiAny) : "$neg_lo$neg_hi", !and(NegLoAny, !not(NegHiAny)) : "$neg_lo", !and(!not(NegLoAny), !not(NegHiAny)) : ""); - let AsmVOP3P = "$vdst, $src0, $src1, $src2"#IndexKeyAsm#MatrxFMTAsm#MatrixReuseAsm#NegAsm#ClampAsm; + let AsmVOP3P = "$vdst, $src0, $src1, $src2"#IndexKeyAsm#MatrixScaleSrcAsm#MatrxFMTAsm#MatrixScaleAsm#MatrixReuseAsm#NegAsm#ClampAsm; // isel patterns bit IsAB_BF16_IMod0 = !and(IsAB_BF16, !not(HasIModOp)); bit IsAB_F16_IMod0 = !and(IsAB_F16, !not(HasIModOp)); bit IsAB_F32F64_IMod1 = !and(!or(IsAB_F64, IsAB_F32), HasIModOp); bit IsAB_F16BF16_IMod1 = !and(!or(IsAB_F16, IsAB_BF16), HasIModOp); - dag Src0InPat = !cond(IsAB_F32F64_IMod1 : (ins (VOP3PModsNeg i32:$src0_modifiers), Src0VT:$src0), - IsAB_F16BF16_IMod1 : (ins (VOP3PModsNegs i32:$src0_modifiers), Src0VT:$src0), + dag Src0InPat = !cond(IsAB_F32F64_IMod1 : (ins timm:$src0_modifiers, Src0VT:$src0), + IsAB_F16BF16_IMod1 : (ins timm:$src0_modifiers, Src0VT:$src0), IsAB_F16_IMod0 : (ins (Src0VT (WMMAModsF16Neg Src0VT:$src0, i32:$src0_modifiers))), IsAB_BF16_IMod0 : (ins Src0VT:$src0), - IsIU : (ins (VOP3PModsNeg i32:$src0_modifiers), Src0VT:$src0), + IsIU : (ins timm:$src0_modifiers, Src0VT:$src0), HasMatrixFMT : (ins timm:$matrix_a_fmt, Src0VT:$src0), NoABMods : (ins Src0VT:$src0)); - dag Src0OutPat = !cond(IsAB_F32F64_IMod1 : (ins i32:$src0_modifiers, Src0VT:$src0), - IsAB_F16BF16_IMod1 : (ins i32:$src0_modifiers, Src0VT:$src0), + dag Src0OutPat = !cond(IsAB_F32F64_IMod1 : (ins (VOP3PModsNeg $src0_modifiers), Src0VT:$src0), + IsAB_F16BF16_IMod1 : (ins (VOP3PModsNegs $src0_modifiers), Src0VT:$src0), IsAB_F16_IMod0 : (ins i32:$src0_modifiers, Src0VT:$src0), IsAB_BF16_IMod0 : (ins (i32 8), Src0VT:$src0), - IsIU : (ins 
i32:$src0_modifiers, Src0VT:$src0), + IsIU : (ins (VOP3PModsNeg $src0_modifiers), Src0VT:$src0), NoABMods : (ins Src0VT:$src0)); - dag Src1InPat = !cond(IsAB_F32F64_IMod1 : (ins (VOP3PModsNeg i32:$src1_modifiers), Src1VT:$src1), - IsAB_F16BF16_IMod1 : (ins (VOP3PModsNegs i32:$src1_modifiers), Src1VT:$src1), + dag Src1InPat = !cond(IsAB_F32F64_IMod1 : (ins timm:$src1_modifiers, Src1VT:$src1), + IsAB_F16BF16_IMod1 : (ins timm:$src1_modifiers, Src1VT:$src1), IsAB_F16_IMod0 : (ins (Src1VT (WMMAModsF16Neg Src1VT:$src1, i32:$src1_modifiers))), IsAB_BF16_IMod0 : (ins Src1VT:$src1), - IsIU : (ins (VOP3PModsNeg i32:$src1_modifiers), Src1VT:$src1), + IsIU : (ins timm:$src1_modifiers, Src1VT:$src1), HasMatrixFMT : (ins timm:$matrix_b_fmt, Src1VT:$src1), NoABMods : (ins Src1VT:$src1)); - dag Src1OutPat = !cond(IsAB_F32F64_IMod1 : (ins i32:$src1_modifiers, Src1VT:$src1), - IsAB_F16BF16_IMod1 : (ins i32:$src1_modifiers, Src1VT:$src1), + dag Src1OutPat = !cond(IsAB_F32F64_IMod1 : (ins (VOP3PModsNeg $src1_modifiers), Src1VT:$src1), + IsAB_F16BF16_IMod1 : (ins (VOP3PModsNegs $src1_modifiers), Src1VT:$src1), IsAB_F16_IMod0 : (ins i32:$src1_modifiers, Src1VT:$src1), IsAB_BF16_IMod0 : (ins (i32 8), Src1VT:$src1), - IsIU : (ins i32:$src1_modifiers, Src1VT:$src1), + IsIU : (ins (VOP3PModsNeg $src1_modifiers), Src1VT:$src1), NoABMods : (ins Src1VT:$src1)); bit IsC_IMod1 = !and(HasIModOp, IsWMMA, !not(IsIU), !not(IsXF32)); bit IsC_F32_IMod0 = !and(IsC_F32, !not(HasIModOp)); bit IsC_F16_IMod0 = !and(IsC_F16, !not(HasIModOp)); bit IsC_BF16_IMod0 = !and(IsC_BF16, !not(HasIModOp)); bit IsIUXF32 = !or(IsIU, IsXF32); - dag Src2InPatWmma = !cond(IsC_IMod1 : (ins (VOP3PModsNegAbs i32:$src2_modifiers), Src2VT:$src2), + dag Src2InPatWmma = !cond(IsC_IMod1 : (ins timm:$src2_modifiers, Src2VT:$src2), IsC_F32_IMod0 : (ins (Src2VT (WMMAModsF32NegAbs Src2VT:$src2, i32:$src2_modifiers))), IsC_F16_IMod0 : (ins (Src2VT (WMMAModsF16NegAbs Src2VT:$src2, i32:$src2_modifiers))), IsC_BF16_IMod0 : (ins Src2VT:$src2), IsIUXF32 : (ins Src2VT:$src2), IsSWMMAC : (ins)); - dag Src2OutPatWmma = !cond(IsC_IMod1 : (ins i32:$src2_modifiers, Src2VT:$src2), + dag Src2OutPatWmma = !cond(IsC_IMod1 : (ins (VOP3PModsNegAbs $src2_modifiers), Src2VT:$src2), IsC_F32_IMod0 : (ins i32:$src2_modifiers, Src2VT:$src2), IsC_F16_IMod0 : (ins i32:$src2_modifiers, Src2VT:$src2), IsC_BF16_IMod0 : (ins (i32 8), Src2VT:$src2), @@ -1604,22 +1616,29 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, !eq(IndexType, 16): (ins i32:$src2, i32:$index_key_16bit), !eq(IndexType, 32): (ins i64:$src2, i32:$index_key_32bit)); dag MatrixFMTOutPat = !if(HasMatrixFMT, (ins i32:$matrix_a_fmt, i32:$matrix_b_fmt), (ins)); - dag Src2InlineInPat = !con(!if(IsC_IMod1, (ins (VOP3PModsNegAbs i32:$src2_modifiers)), (ins)), (ins (Src2VT (WMMAVISrc Src2VT:$src2)))); - dag Src2InlineOutPat = !con(!if(IsIUXF32, (ins), !if(IsC_IMod1, (ins i32:$src2_modifiers), (ins (i32 8)))), (ins Src2VT:$src2)); + dag Src2InlineInPat = !con(!if(IsC_IMod1, (ins timm:$src2_modifiers), (ins)), (ins (Src2VT (WMMAVISrc Src2VT:$src2)))); + dag Src2InlineOutPat = !con(!if(IsIUXF32, (ins), !if(IsC_IMod1, (ins (VOP3PModsNegAbs $src2_modifiers)), (ins (i32 8)))), (ins Src2VT:$src2)); + dag MatrixScaleInPat = !if(HasMatrixScale, (ins timm:$matrix_a_scale, timm:$matrix_a_scale_fmt, ScaleTy:$scale_src0, + timm:$matrix_b_scale, timm:$matrix_b_scale_fmt, ScaleTy:$scale_src1), + (ins)); dag MatrixReuseInPat = !if(HasMatrixReuse, (ins timm:$matrix_a_reuse, timm:$matrix_b_reuse), (ins)); + dag 
MatrixScaleOutSrcPat = !if(HasMatrixScale, (ins ScaleTy:$scale_src0, ScaleTy:$scale_src1), (ins)); + dag MatrixScaleOutModPat = !if(HasMatrixScale, (ins i32:$matrix_a_scale, i32:$matrix_b_scale, i32:$matrix_a_scale_fmt, i32:$matrix_b_scale_fmt), (ins)); dag MatrixReuseOutModPat = !if(HasMatrixReuse, (ins i1:$matrix_a_reuse, i1:$matrix_b_reuse), (ins)); - dag WmmaInPat = !con(Src0InPat, Src1InPat, Src2InPatWmma, MatrixReuseInPat, ClampPat); - dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, MatrixFMTOutPat, MatrixReuseOutModPat, ClampPat); + dag WmmaInPat = !con(Src0InPat, Src1InPat, Src2InPatWmma, MatrixScaleInPat, MatrixReuseInPat, ClampPat); + dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, MatrixScaleOutSrcPat, MatrixFMTOutPat, + MatrixScaleOutModPat, MatrixReuseOutModPat, ClampPat); dag SwmmacInPat = !con(Src0InPat, Src1InPat, (ins Src2VT:$srcTiedDef), IndexInPat, MatrixReuseInPat, ClampPat); dag SwmmacOutPat = !con(Src0OutPat, Src1OutPat, (ins Src2VT:$srcTiedDef), IndexOutPat, MatrixReuseOutModPat, ClampPat); // wmma pattern where src2 is inline imm uses _threeaddr pseudo, // can't use _twoaddr since it would violate src2 tied to vdst constraint. - dag WmmaInlineInPat = !con(Src0InPat, Src1InPat, Src2InlineInPat, MatrixReuseInPat, ClampPat); - dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, MatrixFMTOutPat, MatrixReuseOutModPat, ClampPat); + dag WmmaInlineInPat = !con(Src0InPat, Src1InPat, Src2InlineInPat, MatrixScaleInPat, MatrixReuseInPat, ClampPat); + dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, MatrixScaleOutSrcPat, + MatrixFMTOutPat, MatrixScaleOutModPat, MatrixReuseOutModPat, ClampPat); } def WMMAInstInfoTable : GenericTable { @@ -1645,11 +1664,15 @@ multiclass WMMAInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string Pse let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>, WMMAInstInfo { let PseudoInstr = Instr#PseudoInstrSuffix; + let FixedSize = WMMAProfile.HasMatrixScale; + let Size = !if(WMMAProfile.HasMatrixScale, 16, 8); } let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in def _threeaddr : VOP3P_Pseudo<Instr, WMMAProfile>, WMMAInstInfo { let PseudoInstr = Instr#PseudoInstrSuffix; + let FixedSize = WMMAProfile.HasMatrixScale; + let Size = !if(WMMAProfile.HasMatrixScale, 16, 8); } } @@ -1728,39 +1751,55 @@ def F32_FP8BF8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, i32, v2i32, v4f32], 1, // *** IU4X32_SWMMAC_w64 lanes 0-31 will have 8xi4 remaining lanes are ignored // for matrix A, index is i16; Matrix B uses all lanes -def F32_F32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2f32, v2f32, v8f32], 0, 0, 0, 0, 1, 0, 1>; -def F32_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 0, 1>; -def F32_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v16f16, v8f32], 0, 0, 0, 0, 1, 0, 1>; -def F16_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v16f16, v8f16], 0, 0, 0, 0, 1, 0, 1>; -def BF16_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8bf16], 0, 0, 0, 0, 1, 0, 1>; -def BF16F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 0, 1>; -def F32_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 0, 1>; -def F32_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 0, 1>; -def F16_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v8i32, v8f16], 0, 0, 0, 1, 1, 0, 
1>; -def F16_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16i32, v16i32, v8f16], 0, 0, 0, 1, 1, 0, 1>; -def F32_32X16X128_F4_WMMA_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 0, 1, 0, 0, 1>; -def I32_IU8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v8i32, v8i32], 0, 0, 1, 0, 1, 0, 1>; -def F32_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v32f16, v8f32], 1, 16, 0, 0, 1, 0, 1>; -def F32_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v32bf16, v8f32], 1, 16, 0, 0, 1, 0, 1>; -def F16_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v32f16, v8f16], 1, 16, 0, 0, 1, 0, 1>; -def BF16_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v32bf16, v8bf16], 1, 16, 0, 0, 1, 0, 1>; -def F32_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 1, 32, 0, 1, 1, 0, 1>; -def F16_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v16i32, v8f16], 1, 32, 0, 1, 1, 0, 1>; -def I32_IU8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v16i32, v8i32], 1, 32, 1, 0, 1, 0, 1>; - -multiclass WMMA_F8F6F4_Profiles<bit HasMatrixReuse> { - def _f8_f8_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>; - def _f8_f6_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>; - def _f8_f4_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>; - def _f6_f8_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>; - def _f6_f6_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>; - def _f6_f4_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>; - def _f4_f8_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>; - def _f4_f6_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>; - def _f4_f4_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixReuse>; -} - -defm F32_16X16X128_F8F6F4 : WMMA_F8F6F4_Profiles<0>; +def F32_F32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2f32, v2f32, v8f32], 0, 0, 0, 0, 1, 0, 0, 0, 1>; +def F32_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 0, 0, 0, 1>; +def F32_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v16f16, v8f32], 0, 0, 0, 0, 1, 0, 0, 0, 1>; +def F16_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v16f16, v8f16], 0, 0, 0, 0, 1, 0, 0, 0, 1>; +def BF16_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8bf16], 0, 0, 0, 0, 1, 0, 0, 0, 1>; +def BF16F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 0, 0, 0, 1>; +def F32_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 0, 0, 0, 1>; +def F32_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 0, 0, 0, 1>; +def F16_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v8i32, v8f16], 0, 0, 0, 1, 1, 0, 0, 0, 1>; +def F16_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16i32, v16i32, v8f16], 0, 0, 0, 1, 1, 0, 0, 0, 1>; +def F32_32X16X128_F4_WMMA_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 0, 1, 0, 0, 0, 0, 1>; +def I32_IU8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v8i32, v8i32], 0, 0, 1, 0, 1, 0, 0, 0, 1>; +def F32_32X16X128_F4_SCALE_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 1, 1, 0, 1, 0, 1>; +def F32_32X16X128_F4_SCALE16_w32 : VOP3PWMMA_Profile<[v16f32, 
v16i32, v8i32, v16f32], 0, 0, 0, 1, 1, 0, 1, 1, 1>; +def F32_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v32f16, v8f32], 1, 16, 0, 0, 1, 0, 0, 0, 1>; +def F32_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v32bf16, v8f32], 1, 16, 0, 0, 1, 0, 0, 0, 1>; +def F16_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v32f16, v8f16], 1, 16, 0, 0, 1, 0, 0, 0, 1>; +def BF16_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v32bf16, v8bf16], 1, 16, 0, 0, 1, 0, 0, 0, 1>; +def F32_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 1, 32, 0, 1, 1, 0, 0, 0, 1>; +def F16_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v16i32, v8f16], 1, 32, 0, 1, 1, 0, 0, 0, 1>; +def I32_IU8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v16i32, v8i32], 1, 32, 1, 0, 1, 0, 0, 0, 1>; + +multiclass WMMA_F8F6F4_Profiles<bit HasMatrixScale, bit Scale16, bit HasMatrixReuse> { + def _f8_f8_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; + def _f8_f6_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; + def _f8_f4_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; + def _f6_f8_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; + def _f6_f6_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; + def _f6_f4_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; + def _f4_f8_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; + def _f4_f6_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; + def _f4_f4_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; +} + +defm F32_16X16X128_F8F6F4 : WMMA_F8F6F4_Profiles<0, 0, 0>; +defm F32_16X16X128_F8F6F4_SCALE : WMMA_F8F6F4_Profiles<1, 0, 1>; +defm F32_16X16X128_F8F6F4_SCALE16 : WMMA_F8F6F4_Profiles<1, 1, 1>; + +class VOP_WMMA_LD_SCALE<ValueType vt, RegisterOperand RC> : VOP3P_Profile<VOPProfile<[untyped, vt, vt, untyped]>> { + let HasMatrixScale = 1; + let HasMatrixReuse = 1; + let HasNeg = 0; + let Src0RC64 = RC; + let Src1RC64 = RC; + let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, MatrixAScale:$matrix_a_scale, MatrixBScale:$matrix_b_scale, + MatrixAScaleFmt:$matrix_a_scale_fmt, MatrixBScaleFmt:$matrix_b_scale_fmt, + MatrixAReuse:$matrix_a_reuse, MatrixBReuse:$matrix_b_reuse); + let AsmVOP3P = " $src0, $src1$matrix_a_scale$matrix_b_scale$matrix_a_scale_fmt$matrix_b_scale_fmt$matrix_a_reuse$matrix_b_reuse"; +} multiclass WMMAInst_SrcFormats_mc<string OpName, string Profile> { foreach I = ["f8_f8", "f8_f6", "f8_f4", "f6_f8", "f6_f6", "f6_f4", "f4_f8", "f4_f6", "f4_f4"] in { @@ -1813,9 +1852,15 @@ defm V_SWMMAC_F32_16X16X64_F16_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x64 defm V_SWMMAC_F16_16X16X64_F16_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x64_f16", F16_F16X64_SWMMAC_w32, "_w32">; defm V_WMMA_F32_16X16X128_F8F6F4 : WMMAInst_SrcFormats_mc<"v_wmma_f32_16x16x128_f8f6f4", "F32_16X16X128_F8F6F4">; +defm V_WMMA_SCALE_F32_16X16X128_F8F6F4 : WMMAInst_SrcFormats_mc<"v_wmma_scale_f32_16x16x128_f8f6f4", "F32_16X16X128_F8F6F4_SCALE">; +defm V_WMMA_SCALE16_F32_16X16X128_F8F6F4 : 
WMMAInst_SrcFormats_mc<"v_wmma_scale16_f32_16x16x128_f8f6f4", "F32_16X16X128_F8F6F4_SCALE16">; +defm V_WMMA_SCALE_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale_f32_32x16x128_f4", F32_32X16X128_F4_SCALE_w32, "_w32">; +defm V_WMMA_SCALE16_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale16_f32_32x16x128_f4", F32_32X16X128_F4_SCALE16_w32, "_w32">; } // End is_wmma_xdl = 1. +defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE<i32, VCSrc_b32>>; +defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3PInst<"v_wmma_ld_scale16_paired_b64", VOP_WMMA_LD_SCALE<i64, VCSrc_b64>>; } // End SubtargetPredicate = isGFX125xOnly } // End WaveSizePredicate = isWave32 @@ -1970,9 +2015,13 @@ let SubtargetPredicate = isGFX125xOnly in { defm : WMMAPat<"V_WMMA_F32_16X16X128_BF8_FP8_w32", int_amdgcn_wmma_f32_16x16x128_bf8_fp8, F32_FP8BF8X128_WMMA_w32>; defm : WMMAPat<"V_WMMA_F32_16X16X128_BF8_BF8_w32", int_amdgcn_wmma_f32_16x16x128_bf8_bf8, F32_FP8BF8X128_WMMA_w32>; defm : WMMAPat<"V_WMMA_F32_32X16X128_F4_w32", int_amdgcn_wmma_f32_32x16x128_f4, F32_32X16X128_F4_WMMA_w32>; + defm : WMMAPat<"V_WMMA_SCALE_F32_32X16X128_F4_w32", int_amdgcn_wmma_scale_f32_32x16x128_f4, F32_32X16X128_F4_SCALE_w32>; + defm : WMMAPat<"V_WMMA_SCALE16_F32_32X16X128_F4_w32", int_amdgcn_wmma_scale16_f32_32x16x128_f4, F32_32X16X128_F4_SCALE16_w32>; foreach I = ["f8_f8", "f8_f6", "f8_f4", "f6_f8", "f6_f6", "f6_f4", "f4_f8", "f4_f6", "f4_f4"] in { defm : WMMAPat<"V_WMMA_F32_16X16X128_F8F6F4_" # I # "_w32", int_amdgcn_wmma_f32_16x16x128_f8f6f4, !cast<VOP3PWMMA_Profile>("F32_16X16X128_F8F6F4_" # I # "_w32")>; + defm : WMMAPat<"V_WMMA_SCALE_F32_16X16X128_F8F6F4_" # I # "_w32", int_amdgcn_wmma_scale_f32_16x16x128_f8f6f4, !cast<VOP3PWMMA_Profile>("F32_16X16X128_F8F6F4_SCALE_" # I # "_w32")>; + defm : WMMAPat<"V_WMMA_SCALE16_F32_16X16X128_F8F6F4_" # I # "_w32", int_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4, !cast<VOP3PWMMA_Profile>("F32_16X16X128_F8F6F4_SCALE16_" # I # "_w32")>; } def : SWMMACPat<V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x64_bf16, F32_BF16X64_SWMMAC_w32>; @@ -2105,6 +2154,82 @@ multiclass VOP3P_Real_WMMA_gfx1250_SrcFormats<bits<8> op, string WMMAP> { } } +class VOP3PX2e <bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile P> : Enc128, VOP3Pe_Base { + bits<9> scale_src0; + bits<9> scale_src1; + + // Inst{7-0} = unused + let Inst{10-8} = {0, matrix_b_scale_fmt{1-0}}; // neg_hi + let Inst{11} = matrix_a_scale{0}; // scale_op_sel(0) + let Inst{12} = 0; // scale_op_sel(1) + let Inst{13} = matrix_a_reuse; // scale_op_sel(2) + let Inst{14} = matrix_b_reuse; // scale_op_sel_hi(2) + let Inst{15} = 0; // scale_clamp + let Inst{31-24} = 0xcc; // Encoding + let Inst{23-16} = LdScaleOp; + let Inst{40-32} = scale_src0; + let Inst{49-41} = scale_src1; + let Inst{58-50} = 0; // scale src2 + let Inst{59} = matrix_b_scale{0}; // scale_op_sel_hi(0) + let Inst{60} = 0; // scale_op_sel_hi(1) + let Inst{63-61} = {0, matrix_a_scale_fmt{1-0}}; // neg (lo) + + // The high half of the encoding is the unscaled wmma op. 
+ let Inst{71-64} = vdst; + + let Inst{72} = !if(P.NegHi01, src0_modifiers{1}, 0); // neg_hi src0 + let Inst{73} = !if(P.NegHi01, src1_modifiers{1}, 0); // neg_hi src1 + let Inst{74} = !if(P.NegHi2, src2_modifiers{1}, 0); // neg_hi src2 + + let Inst{77-75} = !if(P.HasMatrixFMT, matrix_a_fmt{2-0}, 0); // op_sel + + let Inst{78,124,123} = !if(P.HasMatrixFMT, matrix_b_fmt{2-0}, 7); // op_sel_hi + let Inst{79} = !if(P.HasClamp, clamp{0}, 0); + + let Inst{87-80} = op; + let Inst{95-88} = 0xcc; //encoding + let Inst{104-96} = !if(P.HasSrc0, src0, 0); + let Inst{113-105} = !if(P.HasSrc1, src1, 0); + let Inst{122-114} = !if(P.HasSrc2, src2, 0); + + // neg_lo + let Inst{125} = !if(P.NegLo01, src0_modifiers{0}, 0); + let Inst{126} = !if(P.NegLo01, src1_modifiers{0}, 0); + let Inst{127} = !if(P.NegLo2, src2_modifiers{0}, 0); +} + +multiclass VOP3PX2_Real_ScaledWMMA_F4<bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile WMMAP> { + defvar PS = !cast<VOP3P_Pseudo>(NAME # "_twoaddr"); + let SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32, + DecoderNamespace = "GFX1250" in { + def _gfx1250 : VOP3P_Real_Gen<PS, GFX1250Gen, PS.Mnemonic>, + VOP3PX2e <op, LdScaleOp, WMMAP>; + } +} + +multiclass VOP3PX2_Real_ScaledWMMA<bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile WMMAP> { + defvar PS = !cast<VOP3P_Pseudo>(NAME # "_twoaddr"); + defvar asmName = !substr(PS.Mnemonic, 0, !sub(!size(PS.Mnemonic), !size("_f8_f8_w32"))); + defvar psName = !substr(NAME, 0, !sub(!size(PS.Mnemonic), !size("_f8_f8_w32"))); + let SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32, + DecoderNamespace = "GFX1250" in { + def _gfx1250 : VOP3P_Real_Gen<PS, GFX1250Gen, asmName>, + VOP3PX2e <op, LdScaleOp, WMMAP>, + MFMA_F8F6F4_WithSizeTable_Helper<PS, psName # "_f8_f8_w32_gfx1250"> { + let AsmString = asmName # PS.AsmOperands; + } + } +} + +multiclass VOP3PX2_Real_ScaledWMMA_SrcFormats<bits<8> op, bits<8> LdScaleOp, string WMMAP> { + defm _f8_f8_w32 : VOP3PX2_Real_ScaledWMMA<op, LdScaleOp, !cast<VOP3PWMMA_Profile>(WMMAP # "_f8_f8_w32")>; + foreach I = ["f8_f6", "f8_f4", "f6_f8", "f6_f6", "f6_f4", "f4_f8", "f4_f6", "f4_f4"] in { + let isAsmParserOnly = true in { // Disable ambiguous disassembly. 
+ defm _#I#_w32 : VOP3PX2_Real_ScaledWMMA<op, LdScaleOp, !cast<VOP3PWMMA_Profile>(WMMAP # "_" # I # "_w32")>; + } + } +} + defm V_WMMA_F32_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x040, F32_F16_WMMA_w32>; defm V_WMMA_F32_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x041, F32_BF16_WMMA_w32>; defm V_WMMA_F16_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x042, F16_F16_WMMA_w32>; @@ -2180,6 +2305,11 @@ defm V_WMMA_F16_16X16X128_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x087, F16_FP8B defm V_WMMA_F32_32X16X128_F4_w32 : VOP3P_Real_WMMA_gfx1250 <0x088, F32_32X16X128_F4_WMMA_w32>; defm V_WMMA_F32_16X16X128_F8F6F4 : VOP3P_Real_WMMA_gfx1250_SrcFormats<0x033, "F32_16X16X128_F8F6F4">; +defm V_WMMA_SCALE_F32_16X16X128_F8F6F4 : VOP3PX2_Real_ScaledWMMA_SrcFormats<0x033, 0x35, "F32_16X16X128_F8F6F4_SCALE">; +defm V_WMMA_SCALE16_F32_16X16X128_F8F6F4 : VOP3PX2_Real_ScaledWMMA_SrcFormats<0x033, 0x3a, "F32_16X16X128_F8F6F4_SCALE16">; + +defm V_WMMA_SCALE_F32_32X16X128_F4_w32 : VOP3PX2_Real_ScaledWMMA_F4<0x088, 0x35, F32_32X16X128_F4_SCALE_w32>; +defm V_WMMA_SCALE16_F32_32X16X128_F4_w32 : VOP3PX2_Real_ScaledWMMA_F4<0x088, 0x3a, F32_32X16X128_F4_SCALE16_w32>; defm V_SWMMAC_F32_16X16X64_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x065, F32_F16X64_SWMMAC_w32>; defm V_SWMMAC_F32_16X16X64_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x066, F32_BF16X64_SWMMAC_w32>; @@ -2283,6 +2413,9 @@ defm V_FMA_MIX_F32_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3d>; defm V_FMA_MIXLO_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3e>; defm V_FMA_MIXHI_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3f>; +defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3P_Real_gfx1250<0x35>; +defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3P_Real_gfx1250<0x3a>; + let AssemblerPredicate = isGFX1250Plus in def : AMDGPUMnemonicAlias<"v_fma_mix_f32_f16", "v_fma_mix_f32">; diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index f027ab0..3cad5a1 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -475,17 +475,24 @@ class VOP3Pe_Base { bits<1> index_key_32bit; bits<3> matrix_a_fmt; bits<3> matrix_b_fmt; + bits<1> matrix_a_scale; + bits<1> matrix_b_scale; + bits<2> matrix_a_scale_fmt; + bits<2> matrix_b_scale_fmt; bits<1> matrix_a_reuse; bits<1> matrix_b_reuse; } class VOP3Pe <VOPProfile P> : Enc64, VOP3Pe_Base { let Inst{7-0} = !if(P.HasDst, vdst, 0); - let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // neg_hi src0 - let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); // neg_hi src1 + let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, + !if(P.HasMatrixScale, matrix_b_scale_fmt{0}, 0)); // neg_hi src0 + let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, + !if(P.HasMatrixScale, matrix_b_scale_fmt{1}, 0)); // neg_hi src1 let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0); // neg_hi src2 - let Inst{11} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{2}, 0); // op_sel(0) + let Inst{11} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{2}, + !if(P.HasMatrixScale, matrix_a_scale{0}, 0)); // op_sel(0) let Inst{12} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{2}, 0); // op_sel(1) let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2}, !if(P.HasMatrixReuse, matrix_a_reuse, 0)); // op_sel(2) @@ -500,10 +507,17 @@ class VOP3Pe <VOPProfile P> : Enc64, VOP3Pe_Base { let Inst{40-32} = !if(P.HasSrc0, src0, 0); let Inst{49-41} = !if(P.HasSrc1, src1, 0); let Inst{58-50} = !if(P.HasSrc2, src2, 0); - let Inst{59} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{3}, !if(P.IsDOT, 1, ?)); // 
op_sel_hi(0) - let Inst{60} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, !if(P.IsDOT, 1, ?)); // op_sel_hi(1) - let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); // neg (lo) - let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); // neg (lo) + let Inst{59} = !cond(!and(P.HasSrc0, P.HasOpSel) : src0_modifiers{3}, + P.IsDOT : 1, + P.HasMatrixScale : matrix_b_scale{0}, + 1: ?); // op_sel_hi(0) + let Inst{60} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, + !if(P.HasMatrixScale, 0, + !if(P.IsDOT, 1, ?))); // op_sel_hi(1) + let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, + !if(P.HasMatrixScale, matrix_a_scale_fmt{0}, 0)); // neg (lo) + let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, + !if(P.HasMatrixScale, matrix_a_scale_fmt{1}, 0)); // neg (lo) let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg (lo) } diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 9366256..74c7c97 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -669,13 +669,9 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, // Integer division functions // RTABI chapter 4.3.1 - { RTLIB::SDIV_I8, RTLIB::__aeabi_idiv__i8 }, - { RTLIB::SDIV_I16, RTLIB::__aeabi_idiv__i16 }, - { RTLIB::SDIV_I32, RTLIB::__aeabi_idiv__i32}, + { RTLIB::SDIV_I32, RTLIB::__aeabi_idiv }, { RTLIB::SDIV_I64, RTLIB::__aeabi_ldivmod }, - { RTLIB::UDIV_I8, RTLIB::__aeabi_uidiv__i8 }, - { RTLIB::UDIV_I16, RTLIB::__aeabi_uidiv__i16 }, - { RTLIB::UDIV_I32, RTLIB::__aeabi_uidiv__i32 }, + { RTLIB::UDIV_I32, RTLIB::__aeabi_uidiv }, { RTLIB::UDIV_I64, RTLIB::__aeabi_uldivmod }, }; // clang-format on @@ -741,7 +737,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, const RTLIB::LibcallImpl Impl; } LibraryCalls[] = { {RTLIB::FPROUND_F32_F16, RTLIB::__aeabi_f2h}, - {RTLIB::FPROUND_F64_F16, RTLIB::__aeabi_d2h}, {RTLIB::FPEXT_F16_F32, RTLIB::__aeabi_h2f}, }; @@ -21363,7 +21358,9 @@ bool ARMTargetLowering::useLoadStackGuardNode(const Module &M) const { } void ARMTargetLowering::insertSSPDeclarations(Module &M) const { - if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) + RTLIB::LibcallImpl SecurityCheckCookieLibcall = + getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE); + if (SecurityCheckCookieLibcall == RTLIB::Unsupported) return TargetLowering::insertSSPDeclarations(M); // MSVC CRT has a global variable holding security cookie. @@ -21372,23 +21369,32 @@ void ARMTargetLowering::insertSSPDeclarations(Module &M) const { // MSVC CRT has a function to validate security cookie. FunctionCallee SecurityCheckCookie = M.getOrInsertFunction( - "__security_check_cookie", Type::getVoidTy(M.getContext()), - PointerType::getUnqual(M.getContext())); + getLibcallImplName(SecurityCheckCookieLibcall), + Type::getVoidTy(M.getContext()), PointerType::getUnqual(M.getContext())); if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) F->addParamAttr(0, Attribute::AttrKind::InReg); } Value *ARMTargetLowering::getSDagStackGuard(const Module &M) const { - // MSVC CRT has a global variable holding security cookie. - if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) + RTLIB::LibcallImpl SecurityCheckCookieLibcall = + getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE); + if (SecurityCheckCookieLibcall != RTLIB::Unsupported) { + // MSVC CRT has a global variable holding security cookie. + // + // FIXME: We have a libcall entry for the correlated check function, but not + // the global name. 
return M.getGlobalVariable("__security_cookie"); + } + return TargetLowering::getSDagStackGuard(M); } Function *ARMTargetLowering::getSSPStackGuardCheck(const Module &M) const { // MSVC CRT has a function to validate security cookie. - if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) - return M.getFunction("__security_check_cookie"); + RTLIB::LibcallImpl SecurityCheckCookie = + getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE); + if (SecurityCheckCookie != RTLIB::Unsupported) + return M.getFunction(getLibcallImplName(SecurityCheckCookie)); return TargetLowering::getSSPStackGuardCheck(M); } diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp index e8d0d35..fedf9e2 100644 --- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -121,10 +121,10 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { return std::make_unique<ARMElfTargetObjectFile>(); } -static std::string computeDataLayout(const Triple &TT, StringRef CPU, +static std::string computeDataLayout(const Triple &TT, const TargetOptions &Options, bool isLittle) { - auto ABI = ARM::computeTargetABI(TT, CPU, Options.MCOptions.ABIName); + auto ABI = ARM::computeTargetABI(TT, Options.MCOptions.ABIName); std::string Ret; if (isLittle) @@ -202,11 +202,10 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT, std::optional<Reloc::Model> RM, std::optional<CodeModel::Model> CM, CodeGenOptLevel OL, bool isLittle) - : CodeGenTargetMachineImpl(T, computeDataLayout(TT, CPU, Options, isLittle), - TT, CPU, FS, Options, - getEffectiveRelocModel(TT, RM), + : CodeGenTargetMachineImpl(T, computeDataLayout(TT, Options, isLittle), TT, + CPU, FS, Options, getEffectiveRelocModel(TT, RM), getEffectiveCodeModel(CM, CodeModel::Small), OL), - TargetABI(ARM::computeTargetABI(TT, CPU, Options.MCOptions.ABIName)), + TargetABI(ARM::computeTargetABI(TT, Options.MCOptions.ABIName)), TLOF(createTLOF(getTargetTriple())), isLittle(isLittle) { // Default to triple-appropriate float ABI diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index dfa3de3c..cc1c79b 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -296,9 +296,9 @@ static bool needsInterworking(const MCAssembler &Asm, const MCSymbol *Sym, unsigned FixupKind) { // Create relocations for unconditional branches to function symbols with // different execution mode in ELF binaries. - if (!Sym || !Sym->isELF()) + if (!Sym || !Asm.getContext().isELF()) return false; - unsigned Type = cast<MCSymbolELF>(Sym)->getType(); + unsigned Type = static_cast<const MCSymbolELF *>(Sym)->getType(); if ((Type == ELF::STT_FUNC || Type == ELF::STT_GNU_IFUNC)) { if (Asm.isThumbFunc(Sym) && (FixupKind == ARM::fixup_arm_uncondbranch)) return true; @@ -1108,9 +1108,8 @@ std::optional<bool> ARMAsmBackend::evaluateFixup(const MCFragment &F, } void ARMAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { if (IsResolved && shouldForceRelocation(Fixup, Target)) IsResolved = false; maybeAddReloc(F, Fixup, Target, Value, IsResolved); @@ -1124,14 +1123,15 @@ void ARMAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, return; // Doesn't change encoding. 
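A note on the recurring applyFixup change in these hunks, which repeats across the ARM, AVR, BPF, CSKY, Hexagon, Lanai, LoongArch, M68k, MSP430, and Mips backends: the fixup data is now passed as a uint8_t *Data that already points at Fixup.getOffset() within the fragment, so the per-byte patch loops index from zero instead of adding the offset, and the bounds check moves into a single assert against F.getSize(). The sketch below is illustrative only and is not part of the patch; the helper name and standalone signature are assumptions made for the example, not LLVM API. It just restates the little-endian masking loop the hunks share.

    // Illustrative sketch (not part of the patch): OR a fixup value into the
    // bytes at Data, where Data is assumed to already point at the fixup's
    // offset inside the fragment, mirroring the new applyFixup convention.
    #include <cstdint>

    static void patchLittleEndian(uint8_t *Data, unsigned NumBytes,
                                  uint64_t Value) {
      // For each byte the fixup touches, mask in the corresponding bits of
      // Value, least-significant byte first.
      for (unsigned I = 0; I != NumBytes; ++I)
        Data[I] |= uint8_t((Value >> (I * 8)) & 0xff);
    }

Passing a pre-offset pointer lets each backend drop its local Offset variable while keeping the range check (Fixup.getOffset() + NumBytes <= F.getSize()) in one place, as the surrounding hunks show.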
const unsigned NumBytes = getFixupKindNumBytes(Kind); - unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!"); + assert(Fixup.getOffset() + NumBytes <= F.getSize() && + "Invalid fixup offset!"); // Used to point to big endian bytes. unsigned FullSizeBytes; if (Endian == llvm::endianness::big) { FullSizeBytes = getFixupKindContainerSizeBytes(Kind); - assert((Offset + FullSizeBytes) <= Data.size() && "Invalid fixup size!"); + assert(Fixup.getOffset() + FullSizeBytes <= F.getSize() && + "Invalid fixup size!"); assert(NumBytes <= FullSizeBytes && "Invalid fixup size!"); } @@ -1141,7 +1141,7 @@ void ARMAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, for (unsigned i = 0; i != NumBytes; ++i) { unsigned Idx = Endian == llvm::endianness::little ? i : (FullSizeBytes - 1 - i); - Data[Offset + Idx] |= uint8_t((Value >> (i * 8)) & 0xff); + Data[Idx] |= uint8_t((Value >> (i * 8)) & 0xff); } } diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h index 07d2cf7..2844232 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h @@ -40,8 +40,7 @@ public: std::optional<bool> evaluateFixup(const MCFragment &, MCFixup &, MCValue &, uint64_t &) override; void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; unsigned getRelaxedOpcode(unsigned Op, const MCSubtargetInfo &STI) const; diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp index 50e9ca1..d914f6e 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp @@ -97,8 +97,8 @@ unsigned ARMELFObjectWriter::getRelocType(const MCFixup &Fixup, case ARM::S_TLSLDM_FDPIC: case ARM::S_TLSLDO: case ARM::S_TPOFF: - if (auto *SA = Target.getAddSym()) - cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS); + if (auto *SA = const_cast<MCSymbol *>(Target.getAddSym())) + static_cast<MCSymbolELF *>(SA)->setType(ELF::STT_TLS); break; default: break; diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index 6dfe846..0796746 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -614,7 +614,7 @@ public: if (!IsThumb) return Val; - unsigned Type = cast<MCSymbolELF>(Symbol)->getType(); + unsigned Type = static_cast<MCSymbolELF *>(Symbol)->getType(); if ((Type == ELF::STT_FUNC || Type == ELF::STT_GNU_IFUNC) && Symbol->isDefined()) getAssembler().setIsThumbFunc(Symbol); @@ -679,7 +679,8 @@ private: } void EmitMappingSymbol(StringRef Name) { - auto *Symbol = cast<MCSymbolELF>(getContext().createLocalSymbol(Name)); + auto *Symbol = + static_cast<MCSymbolELF *>(getContext().createLocalSymbol(Name)); emitLabel(Symbol); Symbol->setType(ELF::STT_NOTYPE); @@ -687,7 +688,8 @@ private: } void emitMappingSymbol(StringRef Name, MCFragment &F, uint64_t Offset) { - auto *Symbol = cast<MCSymbolELF>(getContext().createLocalSymbol(Name)); + auto *Symbol = + static_cast<MCSymbolELF *>(getContext().createLocalSymbol(Name)); emitLabelAtPos(Symbol, SMLoc(), F, Offset); Symbol->setType(ELF::STT_NOTYPE); Symbol->setBinding(ELF::STB_LOCAL); @@ -1088,7 +1090,7 @@ void 
ARMTargetELFStreamer::emitLabel(MCSymbol *Symbol) { return; Streamer.getAssembler().registerSymbol(*Symbol); - unsigned Type = cast<MCSymbolELF>(Symbol)->getType(); + unsigned Type = static_cast<MCSymbolELF *>(Symbol)->getType(); if (Type == ELF::STT_FUNC || Type == ELF::STT_GNU_IFUNC) emitThumbFunc(Symbol); } diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp index 354de8f..8ee3a2d 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp @@ -505,7 +505,7 @@ public: // Remember that the function is a thumb function. Fixup and relocation // values will need adjusted. getStreamer().getAssembler().setIsThumbFunc(Symbol); - cast<MCSymbolMachO>(Symbol)->setThumbFunc(); + static_cast<MCSymbolMachO *>(Symbol)->setThumbFunc(); } }; } // namespace diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp index 38444f9..05a7d03 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp @@ -368,9 +368,8 @@ AVRAsmBackend::createObjectTargetWriter() const { } void AVRAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { // AVR sets the fixup value to bypass the assembly time overflow with a // relocation. if (IsResolved) { @@ -397,14 +396,14 @@ void AVRAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, // Shift the value into position. Value <<= Info.TargetOffset; - unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!"); + assert(Fixup.getOffset() + NumBytes <= F.getSize() && + "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the // bits from the fixup value. 
for (unsigned i = 0; i < NumBytes; ++i) { uint8_t mask = (((Value >> (i * 8)) & 0xff)); - Data[Offset + i] |= mask; + Data[i] |= mask; } } diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h index 68c839e..9633669 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h @@ -38,8 +38,7 @@ public: createObjectTargetWriter() const override; void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; std::optional<MCFixupKind> getFixupKind(StringRef Name) const override; MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override; diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp index dda8753..53933f9 100644 --- a/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp +++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp @@ -27,8 +27,7 @@ public: ~BPFAsmBackend() override = default; void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; std::unique_ptr<MCObjectTargetWriter> createObjectTargetWriter() const override; @@ -66,35 +65,32 @@ bool BPFAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count, } void BPFAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { maybeAddReloc(F, Fixup, Target, Value, IsResolved); if (Fixup.getKind() == FK_SecRel_8) { // The Value is 0 for global variables, and the in-section offset // for static variables. Write to the immediate field of the inst. assert(Value <= UINT32_MAX); - support::endian::write<uint32_t>(&Data[Fixup.getOffset() + 4], - static_cast<uint32_t>(Value), + support::endian::write<uint32_t>(Data + 4, static_cast<uint32_t>(Value), Endian); } else if (Fixup.getKind() == FK_Data_4 && !Fixup.isPCRel()) { - support::endian::write<uint32_t>(&Data[Fixup.getOffset()], Value, Endian); + support::endian::write<uint32_t>(Data, Value, Endian); } else if (Fixup.getKind() == FK_Data_8) { - support::endian::write<uint64_t>(&Data[Fixup.getOffset()], Value, Endian); + support::endian::write<uint64_t>(Data, Value, Endian); } else if (Fixup.getKind() == FK_Data_4 && Fixup.isPCRel()) { Value = (uint32_t)((Value - 8) / 8); if (Endian == llvm::endianness::little) { - Data[Fixup.getOffset() + 1] = 0x10; - support::endian::write32le(&Data[Fixup.getOffset() + 4], Value); + Data[1] = 0x10; + support::endian::write32le(Data + 4, Value); } else { - Data[Fixup.getOffset() + 1] = 0x1; - support::endian::write32be(&Data[Fixup.getOffset() + 4], Value); + Data[1] = 0x1; + support::endian::write32be(Data + 4, Value); } } else if (Fixup.getKind() == BPF::FK_BPF_PCRel_4) { // The input Value represents the number of bytes. 
Value = (uint32_t)((Value - 8) / 8); - support::endian::write<uint32_t>(&Data[Fixup.getOffset() + 4], Value, - Endian); + support::endian::write<uint32_t>(Data + 4, Value, Endian); } else { assert(Fixup.getKind() == FK_Data_2 && Fixup.isPCRel()); @@ -103,8 +99,7 @@ void BPFAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, report_fatal_error("Branch target out of insn range"); Value = (uint16_t)((Value - 8) / 8); - support::endian::write<uint16_t>(&Data[Fixup.getOffset() + 2], Value, - Endian); + support::endian::write<uint16_t>(Data + 2, Value, Endian); } } diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp index 1bd82fad..6964998 100644 --- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp +++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp @@ -197,9 +197,8 @@ std::optional<bool> CSKYAsmBackend::evaluateFixup(const MCFragment &F, } void CSKYAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { if (IsResolved && shouldForceRelocation(Fixup, Target)) IsResolved = false; maybeAddReloc(F, Fixup, Target, Value, IsResolved); @@ -217,10 +216,10 @@ void CSKYAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, // Shift the value into position. Value <<= Info.TargetOffset; - unsigned Offset = Fixup.getOffset(); unsigned NumBytes = alignTo(Info.TargetSize + Info.TargetOffset, 8) / 8; - assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!"); + assert(Fixup.getOffset() + NumBytes <= F.getSize() && + "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the // bits from the fixup value. @@ -228,14 +227,14 @@ void CSKYAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, bool IsInstFixup = (Kind >= FirstTargetFixupKind); if (IsLittleEndian && IsInstFixup && (NumBytes == 4)) { - Data[Offset + 0] |= uint8_t((Value >> 16) & 0xff); - Data[Offset + 1] |= uint8_t((Value >> 24) & 0xff); - Data[Offset + 2] |= uint8_t(Value & 0xff); - Data[Offset + 3] |= uint8_t((Value >> 8) & 0xff); + Data[0] |= uint8_t((Value >> 16) & 0xff); + Data[1] |= uint8_t((Value >> 24) & 0xff); + Data[2] |= uint8_t(Value & 0xff); + Data[3] |= uint8_t((Value >> 8) & 0xff); } else { for (unsigned I = 0; I != NumBytes; I++) { unsigned Idx = IsLittleEndian ? 
I : (NumBytes - 1 - I); - Data[Offset + Idx] |= uint8_t((Value >> (I * 8)) & 0xff); + Data[Idx] |= uint8_t((Value >> (I * 8)) & 0xff); } } } diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h index 1c8516f..5d8826a 100644 --- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h +++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h @@ -25,8 +25,7 @@ public: std::optional<bool> evaluateFixup(const MCFragment &, MCFixup &, MCValue &, uint64_t &) override; void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override; diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp index d042d26..4667975f 100644 --- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp +++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp @@ -48,8 +48,8 @@ unsigned CSKYELFObjectWriter::getRelocType(const MCFixup &Fixup, case CSKY::S_TLSGD: case CSKY::S_TLSLDM: case CSKY::S_TLSLDO: - if (auto *SA = Target.getAddSym()) - cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS); + if (auto *SA = const_cast<MCSymbol *>(Target.getAddSym())) + static_cast<MCSymbolELF *>(SA)->setType(ELF::STT_TLS); break; default: break; diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.cpp index 346b123..397cf16 100644 --- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.cpp +++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.cpp @@ -169,7 +169,8 @@ void CSKYELFStreamer::EmitMappingSymbol(StringRef Name) { State = (Name == "$t" ? EMS_Text : EMS_Data); - auto *Symbol = cast<MCSymbolELF>(getContext().createLocalSymbol(Name)); + auto *Symbol = + static_cast<MCSymbolELF *>(getContext().createLocalSymbol(Name)); emitLabel(Symbol); Symbol->setType(ELF::STT_NOTYPE); diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp index b6e8ce7..26a113d 100644 --- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp +++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp @@ -103,7 +103,7 @@ GlobalVariable *DXContainerGlobals::computeShaderHash(Module &M) { dxbc::ShaderHash HashData = {0, {0}}; // The Hash's IncludesSource flag gets set whenever the hashed shader includes // debug information. 
- if (M.debug_compile_units_begin() != M.debug_compile_units_end()) + if (!M.debug_compile_units().empty()) HashData.Flags = static_cast<uint32_t>(dxbc::HashFlags::IncludesSource); memcpy(reinterpret_cast<void *>(&HashData.Digest), Result.data(), 16); diff --git a/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp b/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp index 5323be6..9a14c01 100644 --- a/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp +++ b/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp @@ -78,8 +78,7 @@ public: ~DXILAsmBackend() override = default; void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override {} + uint8_t *Data, uint64_t Value, bool IsResolved) override {} std::unique_ptr<MCObjectTargetWriter> createObjectTargetWriter() const override { diff --git a/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp b/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp index 102f1c6..14b6bb3 100644 --- a/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp +++ b/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp @@ -330,7 +330,7 @@ bool HexagonCommonGEP::isHandledGepForm(GetElementPtrInst *GepI) { if (!GepI->getType()->isPointerTy()) return false; // No GEPs without any indices. (Is this possible?) - if (GepI->idx_begin() == GepI->idx_end()) + if (GepI->indices().empty()) return false; return true; } diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp index 52fa678..613048b 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp @@ -1987,7 +1987,7 @@ SmallVector<uint32_t, 8> HvxSelector::getPerfectCompletions(ShuffleMask SM, // times). In such cases it will be impossible to complete this to a // perfect shuffle. SmallVector<uint32_t, 8> Sorted(Worklist); - llvm::sort(Sorted.begin(), Sorted.end()); + llvm::sort(Sorted); for (unsigned I = 0, E = Sorted.size(); I != E;) { unsigned P = Sorted[I], Count = 1; diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp index d5b7a75..1a0f1ab 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp @@ -402,8 +402,7 @@ public: } void applyFixup(const MCFragment &, const MCFixup &, const MCValue &, - MutableArrayRef<char> Data, uint64_t FixupValue, - bool IsResolved) override; + uint8_t *Data, uint64_t FixupValue, bool IsResolved) override; bool isInstRelaxable(MCInst const &HMI) const { const MCInstrDesc &MCID = HexagonMCInstrInfo::getDesc(*MCII, HMI); @@ -649,8 +648,7 @@ public: } // namespace void HexagonAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, + const MCValue &Target, uint8_t *InstAddr, uint64_t FixupValue, bool IsResolved) { if (IsResolved && shouldForceRelocation(Fixup)) IsResolved = false; @@ -667,10 +665,9 @@ void HexagonAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, // LLVM gives us an encoded value, we have to convert it back // to a real offset before we can use it. 
- uint32_t Offset = Fixup.getOffset(); unsigned NumBytes = getFixupKindNumBytes(Kind); - assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!"); - char *InstAddr = Data.data() + Offset; + assert(Fixup.getOffset() + NumBytes <= F.getSize() && + "Invalid fixup offset!"); Value = adjustFixupValue(Kind, FixupValue); if (!Value) @@ -757,8 +754,8 @@ void HexagonAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, uint32_t OldData = 0; for (unsigned i = 0; i < NumBytes; i++) OldData |= (InstAddr[i] << (i * 8)) & (0xff << (i * 8)); dbgs() << "\tBValue=0x"; dbgs().write_hex(Value) << ": AValue=0x"; - dbgs().write_hex(FixupValue) - << ": Offset=" << Offset << ": Size=" << Data.size() << ": OInst=0x"; + dbgs().write_hex(FixupValue) << ": Offset=" << Fixup.getOffset() + << ": Size=" << F.getSize() << ": OInst=0x"; dbgs().write_hex(OldData) << ": Reloc=0x"; dbgs().write_hex(Reloc);); // For each byte of the fragment that the fixup touches, mask in the diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp index 9752f3a..af97ea2 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp @@ -50,8 +50,8 @@ unsigned HexagonELFObjectWriter::getRelocType(const MCFixup &Fixup, case HexagonMCExpr::VK_IE: case HexagonMCExpr::VK_IE_GOT: case HexagonMCExpr::VK_TPREL: - if (auto *SA = Target.getAddSym()) - cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS); + if (auto *SA = const_cast<MCSymbol *>(Target.getAddSym())) + static_cast<MCSymbolELF *>(SA)->setType(ELF::STT_TLS); break; default: break; diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp index 13ecc23..039ef4f 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp @@ -96,7 +96,7 @@ void HexagonMCELFStreamer::HexagonMCEmitCommonSymbol(MCSymbol *Symbol, getAssembler().registerSymbol(*Symbol); StringRef sbss[4] = {".sbss.1", ".sbss.2", ".sbss.4", ".sbss.8"}; - auto ELFSymbol = cast<MCSymbolELF>(Symbol); + auto ELFSymbol = static_cast<MCSymbolELF *>(Symbol); if (!ELFSymbol->isBindingSet()) ELFSymbol->setBinding(ELF::STB_GLOBAL); @@ -143,7 +143,7 @@ void HexagonMCELFStreamer::HexagonMCEmitLocalCommonSymbol(MCSymbol *Symbol, Align ByteAlignment, unsigned AccessSize) { getAssembler().registerSymbol(*Symbol); - auto ELFSymbol = cast<MCSymbolELF>(Symbol); + auto ELFSymbol = static_cast<const MCSymbolELF *>(Symbol); ELFSymbol->setBinding(ELF::STB_LOCAL); ELFSymbol->setExternal(false); HexagonMCEmitCommonSymbol(Symbol, Size, ByteAlignment, AccessSize); diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp index 83d1697..3112dea 100644 --- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp +++ b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp @@ -48,8 +48,7 @@ public: : MCAsmBackend(llvm::endianness::big), OSType(OST) {} void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; std::unique_ptr<MCObjectTargetWriter> createObjectTargetWriter() const override; @@ -72,9 +71,8 @@ bool LanaiAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count, } void 
LanaiAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { if (!IsResolved) Asm->getWriter().recordRelocation(F, Fixup, Target, Value); @@ -85,7 +83,6 @@ void LanaiAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, // Where in the object and where the number of bytes that need // fixing up - unsigned Offset = Fixup.getOffset(); unsigned NumBytes = (getFixupKindInfo(Kind).TargetSize + 7) / 8; unsigned FullSize = 4; @@ -95,8 +92,7 @@ void LanaiAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, // Load instruction and apply value for (unsigned i = 0; i != NumBytes; ++i) { unsigned Idx = (FullSize - 1 - i); - CurVal |= static_cast<uint64_t>(static_cast<uint8_t>(Data[Offset + Idx])) - << (i * 8); + CurVal |= static_cast<uint64_t>(static_cast<uint8_t>(Data[Idx])) << (i * 8); } uint64_t Mask = @@ -106,7 +102,7 @@ void LanaiAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, // Write out the fixed up bytes back to the code/data bits. for (unsigned i = 0; i != NumBytes; ++i) { unsigned Idx = (FullSize - 1 - i); - Data[Offset + Idx] = static_cast<uint8_t>((CurVal >> (i * 8)) & 0xff); + Data[Idx] = static_cast<uint8_t>((CurVal >> (i * 8)) & 0xff); } } diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index 5096a8f..d8bb16f 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -1651,20 +1651,19 @@ def : Pat<(vector_insert v8i32:$xd, GRLenVT:$rj, uimm3:$imm), (XVINSGR2VR_W v8i32:$xd, GRLenVT:$rj, uimm3:$imm)>; def : Pat<(vector_insert v4i64:$xd, GRLenVT:$rj, uimm2:$imm), (XVINSGR2VR_D v4i64:$xd, GRLenVT:$rj, uimm2:$imm)>; -def : Pat<(vector_insert v8f32:$xd, (loongarch_movgr2fr_w_la64 GPR:$rj), uimm3:$imm), - (XVINSGR2VR_W $xd, $rj, uimm3:$imm)>; -def : Pat<(vector_insert v4f64:$xd, (f64 (bitconvert i64:$rj)), uimm2:$imm), - (XVINSGR2VR_D $xd, $rj, uimm2:$imm)>; -def : Pat<(vector_insert v8f32:$xd, (f32 (vector_extract v8f32:$xj, uimm3:$imm1)), uimm3:$imm2), - (XVINSGR2VR_W $xd, (XVPICKVE2GR_W v8f32:$xj, uimm3:$imm1), uimm3:$imm2)>; -def : Pat<(vector_insert v4f64:$xd, (f64 (vector_extract v4f64:$xj, uimm2:$imm1)), uimm2:$imm2), - (XVINSGR2VR_D $xd, (XVPICKVE2GR_D v4f64:$xj, uimm2:$imm1), uimm2:$imm2)>; +def : Pat<(vector_insert v8f32:$xd, (loongarch_movgr2fr_w_la64 GPR:$rj), + uimm3:$imm), + (XVINSGR2VR_W v8f32:$xd, GPR:$rj, uimm3:$imm)>; +def : Pat<(vector_insert v4f64:$xd, (f64(bitconvert i64:$rj)), uimm2:$imm), + (XVINSGR2VR_D v4f64:$xd, GPR:$rj, uimm2:$imm)>; // XVINSVE0_{W/D} def : Pat<(vector_insert v8f32:$xd, FPR32:$fj, uimm3:$imm), - (XVINSVE0_W $xd, (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32), uimm3:$imm)>; + (XVINSVE0_W v8f32:$xd, (SUBREG_TO_REG(i64 0), FPR32:$fj, sub_32), + uimm3:$imm)>; def : Pat<(vector_insert v4f64:$xd, FPR64:$fj, uimm2:$imm), - (XVINSVE0_D $xd, (SUBREG_TO_REG (i64 0), FPR64:$fj, sub_64), uimm2:$imm)>; + (XVINSVE0_D v4f64:$xd, (SUBREG_TO_REG(i64 0), FPR64:$fj, sub_64), + uimm2:$imm)>; // scalar_to_vector def : Pat<(v8f32 (scalar_to_vector FPR32:$fj)), @@ -1884,10 +1883,10 @@ def : Pat<(i64 (vector_extract v8i32:$xj, uimm3:$imm)), (XVPICKVE2GR_W v8i32:$xj, uimm3:$imm)>; def : Pat<(i64 (vector_extract v4i64:$xj, uimm2:$imm)), (XVPICKVE2GR_D v4i64:$xj, uimm2:$imm)>; -def : Pat<(f32 (vector_extract v8f32:$xj, 
uimm3:$imm)), - (MOVGR2FR_W (XVPICKVE2GR_W v8f32:$xj, uimm3:$imm))>; -def : Pat<(f64 (vector_extract v4f64:$xj, uimm2:$imm)), - (MOVGR2FR_D (XVPICKVE2GR_D v4f64:$xj, uimm2:$imm))>; +def : Pat<(f32(vector_extract v8f32:$xj, uimm3:$imm)), + (EXTRACT_SUBREG(XVPICKVE_W v8f32:$xj, uimm3:$imm), sub_32)>; +def : Pat<(f64(vector_extract v4f64:$xj, uimm2:$imm)), + (EXTRACT_SUBREG(XVPICKVE_D v4f64:$xj, uimm2:$imm), sub_64)>; // vselect def : Pat<(v32i8 (vselect LASX256:$xd, (v32i8 (SplatPat_uimm8 uimm8:$imm)), diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp index 858f3d0..fda9d97 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp @@ -131,19 +131,18 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, } } -static void fixupLeb128(MCContext &Ctx, const MCFixup &Fixup, - MutableArrayRef<char> Data, uint64_t Value) { +static void fixupLeb128(MCContext &Ctx, const MCFixup &Fixup, uint8_t *Data, + uint64_t Value) { unsigned I; - for (I = 0; I != Data.size() && Value; ++I, Value >>= 7) + for (I = 0; Value; ++I, Value >>= 7) Data[I] |= uint8_t(Value & 0x7f); if (Value) Ctx.reportError(Fixup.getLoc(), "Invalid uleb128 value!"); } void LoongArchAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { if (IsResolved && shouldForceRelocation(Fixup, Target)) IsResolved = false; IsResolved = addReloc(F, Fixup, Target, Value, IsResolved); @@ -166,14 +165,14 @@ void LoongArchAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, // Shift the value into position. Value <<= Info.TargetOffset; - unsigned Offset = Fixup.getOffset(); unsigned NumBytes = alignTo(Info.TargetSize + Info.TargetOffset, 8) / 8; - assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!"); + assert(Fixup.getOffset() + NumBytes <= F.getSize() && + "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the // bits from the fixup value. for (unsigned I = 0; I != NumBytes; ++I) { - Data[Offset + I] |= uint8_t((Value >> (I * 8)) & 0xff); + Data[I] |= uint8_t((Value >> (I * 8)) & 0xff); } } @@ -274,15 +273,14 @@ bool LoongArchAsmBackend::relaxDwarfLineAddr(MCFragment &F, int64_t LineDelta = F.getDwarfLineDelta(); const MCExpr &AddrDelta = F.getDwarfAddrDelta(); - SmallVector<MCFixup, 1> Fixups; size_t OldSize = F.getVarSize(); int64_t Value; if (AddrDelta.evaluateAsAbsolute(Value, *Asm)) return false; - bool IsAbsolute = AddrDelta.evaluateKnownAbsolute(Value, *Asm); - assert(IsAbsolute && "CFA with invalid expression"); - (void)IsAbsolute; + [[maybe_unused]] bool IsAbsolute = + AddrDelta.evaluateKnownAbsolute(Value, *Asm); + assert(IsAbsolute); SmallVector<char> Data; raw_svector_ostream OS(Data); @@ -293,33 +291,23 @@ bool LoongArchAsmBackend::relaxDwarfLineAddr(MCFragment &F, encodeSLEB128(LineDelta, OS); } - unsigned Offset; - std::pair<MCFixupKind, MCFixupKind> FK; - // According to the DWARF specification, the `DW_LNS_fixed_advance_pc` opcode // takes a single unsigned half (unencoded) operand. The maximum encodable // value is therefore 65535. Set a conservative upper bound for relaxation. 
+ unsigned PCBytes; if (Value > 60000) { unsigned PtrSize = C.getAsmInfo()->getCodePointerSize(); - - OS << uint8_t(dwarf::DW_LNS_extended_op); - encodeULEB128(PtrSize + 1, OS); - - OS << uint8_t(dwarf::DW_LNE_set_address); - Offset = OS.tell(); assert((PtrSize == 4 || PtrSize == 8) && "Unexpected pointer size"); - FK = getRelocPairForSize(PtrSize == 4 ? 32 : 64); + PCBytes = PtrSize; + OS << uint8_t(dwarf::DW_LNS_extended_op) << uint8_t(PtrSize + 1) + << uint8_t(dwarf::DW_LNE_set_address); OS.write_zeros(PtrSize); } else { + PCBytes = 2; OS << uint8_t(dwarf::DW_LNS_fixed_advance_pc); - Offset = OS.tell(); - FK = getRelocPairForSize(16); support::endian::write<uint16_t>(OS, 0, llvm::endianness::little); } - - const MCBinaryExpr &MBE = cast<MCBinaryExpr>(AddrDelta); - Fixups.push_back(MCFixup::create(Offset, MBE.getLHS(), std::get<0>(FK))); - Fixups.push_back(MCFixup::create(Offset, MBE.getRHS(), std::get<1>(FK))); + auto Offset = OS.tell() - PCBytes; if (LineDelta == INT64_MAX) { OS << uint8_t(dwarf::DW_LNS_extended_op); @@ -330,7 +318,8 @@ bool LoongArchAsmBackend::relaxDwarfLineAddr(MCFragment &F, } F.setVarContents(Data); - F.setVarFixups(Fixups); + F.setVarFixups({MCFixup::create(Offset, &AddrDelta, + MCFixup::getDataKindForSize(PCBytes))}); WasRelaxed = OldSize != Data.size(); return true; } diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h index 3d929fc..1f13601 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h @@ -42,8 +42,7 @@ public: uint64_t &FixedValue, bool IsResolved); void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; bool shouldForceRelocation(const MCFixup &Fixup, const MCValue &Target); diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp index fb741af..7e021e4 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp @@ -61,8 +61,8 @@ unsigned LoongArchELFObjectWriter::getRelocType(const MCFixup &Fixup, case ELF::R_LARCH_TLS_LD_PCREL20_S2: case ELF::R_LARCH_TLS_GD_PCREL20_S2: case ELF::R_LARCH_TLS_DESC_PCREL20_S2: - if (auto *SA = Target.getAddSym()) - cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS); + if (auto *SA = const_cast<MCSymbol *>(Target.getAddSym())) + static_cast<MCSymbolELF *>(SA)->setType(ELF::STT_TLS); break; default: break; diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp index 7ef705d..fe83dc6 100644 --- a/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp +++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp @@ -53,8 +53,7 @@ public: .Default(false)) {} void applyFixup(const MCFragment &, const MCFixup &, const MCValue &, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; bool mayNeedRelaxation(unsigned Opcode, ArrayRef<MCOperand> Operands, const MCSubtargetInfo &STI) const override; @@ -78,9 +77,8 @@ public: } // end anonymous namespace void M68kAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t 
Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { if (!IsResolved) Asm->getWriter().recordRelocation(F, Fixup, Target, Value); @@ -95,8 +93,7 @@ void M68kAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, // Write in Big Endian for (unsigned i = 0; i != Size; ++i) - Data[Fixup.getOffset() + i] = - uint8_t(static_cast<int64_t>(Value) >> ((Size - i - 1) * 8)); + Data[i] = uint8_t(static_cast<int64_t>(Value) >> ((Size - i - 1) * 8)); } /// cc—Carry clear GE—Greater than or equal diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp index ca94a47..d070409 100644 --- a/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp +++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp @@ -70,8 +70,8 @@ unsigned M68kELFObjectWriter::getRelocType(const MCFixup &Fixup, case M68k::S_TLSLD: case M68k::S_TLSLDM: case M68k::S_TPOFF: - if (auto *SA = Target.getAddSym()) - cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS); + if (auto *SA = const_cast<MCSymbol *>(Target.getAddSym())) + static_cast<MCSymbolELF *>(SA)->setType(ELF::STT_TLS); break; default: break; diff --git a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp index b513503..d892b3a 100644 --- a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp +++ b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp @@ -36,8 +36,7 @@ public: ~MSP430AsmBackend() override = default; void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; std::unique_ptr<MCObjectTargetWriter> createObjectTargetWriter() const override { @@ -105,9 +104,8 @@ uint64_t MSP430AsmBackend::adjustFixupValue(const MCFixup &Fixup, } void MSP430AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { maybeAddReloc(F, Fixup, Target, Value, IsResolved); Value = adjustFixupValue(Fixup, Value, getContext()); MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind()); @@ -117,15 +115,14 @@ void MSP430AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, // Shift the value into position. Value <<= Info.TargetOffset; - unsigned Offset = Fixup.getOffset(); unsigned NumBytes = alignTo(Info.TargetSize + Info.TargetOffset, 8) / 8; - - assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!"); + assert(Fixup.getOffset() + NumBytes <= F.getSize() && + "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the // bits from the fixup value. 
for (unsigned i = 0; i != NumBytes; ++i) { - Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); + Data[i] |= uint8_t((Value >> (i * 8)) & 0xff); } } diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 259b71b..7b2ee83 100644 --- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -2948,8 +2948,9 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr, bool IsPtr64 = ABI.ArePtrs64bit(); bool IsLocalSym = Res.getAddSym()->isInSection() || Res.getAddSym()->isTemporary() || - (Res.getAddSym()->isELF() && - cast<MCSymbolELF>(Res.getAddSym())->getBinding() == ELF::STB_LOCAL); + (getContext().isELF() && + static_cast<const MCSymbolELF *>(Res.getAddSym())->getBinding() == + ELF::STB_LOCAL); // For O32, "$"-prefixed symbols are recognized as temporary while // .L-prefixed symbols are not (PrivateGlobalPrefix is "$"). Recognize ".L" // manually. @@ -6653,7 +6654,7 @@ bool MipsAsmParser::searchSymbolAlias(OperandVector &Operands) { llvm_unreachable("Should never fail"); } } - } else if (Sym->isUnset()) { + } else if (Sym->isUndefined()) { // If symbol is unset, it might be created in the `parseSetAssignment` // routine as an alias for a numeric register name. // Lookup in the aliases list. diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp index c2169be..33aab71 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp @@ -283,9 +283,8 @@ static bool shouldForceRelocation(const MCFixup &Fixup) { /// data fragment, at the offset specified by the fixup and following the /// fixup kind as appropriate. void MipsAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { if (shouldForceRelocation(Fixup)) IsResolved = false; maybeAddReloc(F, Fixup, Target, Value, IsResolved); @@ -297,7 +296,6 @@ void MipsAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, return; // Doesn't change encoding. // Where do we start in the object - unsigned Offset = Fixup.getOffset(); // Number of bytes we need to fixup unsigned NumBytes = (getFixupKindInfo(Kind).TargetSize + 7) / 8; // Used to point to big endian bytes @@ -328,7 +326,7 @@ void MipsAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, unsigned Idx = Endian == llvm::endianness::little ? (microMipsLEByteOrder ? calculateMMLEIndex(i) : i) : (FullSize - 1 - i); - CurVal |= (uint64_t)((uint8_t)Data[Offset + Idx]) << (i*8); + CurVal |= (uint64_t)((uint8_t)Data[Idx]) << (i * 8); } uint64_t Mask = ((uint64_t)(-1) >> @@ -340,7 +338,7 @@ void MipsAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, unsigned Idx = Endian == llvm::endianness::little ? (microMipsLEByteOrder ? 
calculateMMLEIndex(i) : i) : (FullSize - 1 - i); - Data[Offset + Idx] = (uint8_t)((CurVal >> (i*8)) & 0xff); + Data[Idx] = (uint8_t)((CurVal >> (i * 8)) & 0xff); } } diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h index 816626d..40b5853 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h @@ -40,8 +40,7 @@ public: createObjectTargetWriter() const override; void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; std::optional<MCFixupKind> getFixupKind(StringRef Name) const override; MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override; diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp index 7abe9c9..16247bd 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp @@ -166,8 +166,8 @@ unsigned MipsELFObjectWriter::getRelocType(const MCFixup &Fixup, case Mips::S_GOTTPREL: case Mips::S_TPREL_HI: case Mips::S_TPREL_LO: - if (auto *SA = Target.getAddSym()) - cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS); + if (auto *SA = const_cast<MCSymbol *>(Target.getAddSym())) + static_cast<MCSymbolELF *>(SA)->setType(ELF::STT_TLS); break; default: break; @@ -450,6 +450,7 @@ bool MipsELFObjectWriter::needsRelocateWithSymbol(const MCValue &V, needsRelocateWithSymbol(V, (Type >> 8) & 0xff) || needsRelocateWithSymbol(V, (Type >> 16) & 0xff); + auto *Sym = static_cast<const MCSymbolELF *>(V.getAddSym()); switch (Type) { default: errs() << Type << "\n"; @@ -481,7 +482,7 @@ bool MipsELFObjectWriter::needsRelocateWithSymbol(const MCValue &V, // FIXME: It should be safe to return false for the STO_MIPS_MICROMIPS but // we neglect to handle the adjustment to the LSB of the addend that // it causes in applyFixup() and similar. - if (cast<MCSymbolELF>(V.getAddSym())->getOther() & ELF::STO_MIPS_MICROMIPS) + if (Sym->getOther() & ELF::STO_MIPS_MICROMIPS) return true; return false; @@ -492,7 +493,7 @@ bool MipsELFObjectWriter::needsRelocateWithSymbol(const MCValue &V, case ELF::R_MIPS_16: case ELF::R_MIPS_32: case ELF::R_MIPS_GPREL32: - if (cast<MCSymbolELF>(V.getAddSym())->getOther() & ELF::STO_MIPS_MICROMIPS) + if (Sym->getOther() & ELF::STO_MIPS_MICROMIPS) return true; [[fallthrough]]; case ELF::R_MIPS_26: diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp index e8b9746..feeadc5e 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp @@ -76,7 +76,7 @@ void MipsELFStreamer::createPendingLabelRelocs() { // FIXME: Also mark labels when in MIPS16 mode. 
if (ELFTargetStreamer->isMicroMipsEnabled()) { for (auto *L : Labels) { - auto *Label = cast<MCSymbolELF>(L); + auto *Label = static_cast<MCSymbolELF *>(L); getAssembler().registerSymbol(*Label); Label->setOther(ELF::STO_MIPS_MICROMIPS); } diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp index d9680c7..5df70c4 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp @@ -931,7 +931,7 @@ MipsTargetELFStreamer::MipsTargetELFStreamer(MCStreamer &S, } void MipsTargetELFStreamer::emitLabel(MCSymbol *S) { - auto *Symbol = cast<MCSymbolELF>(S); + auto *Symbol = static_cast<MCSymbolELF *>(S); getStreamer().getAssembler().registerSymbol(*Symbol); uint8_t Type = Symbol->getType(); if (Type != ELF::STT_FUNC) @@ -1015,11 +1015,11 @@ void MipsTargetELFStreamer::finish() { } void MipsTargetELFStreamer::emitAssignment(MCSymbol *S, const MCExpr *Value) { - auto *Symbol = cast<MCSymbolELF>(S); + auto *Symbol = static_cast<MCSymbolELF *>(S); // If on rhs is micromips symbol then mark Symbol as microMips. if (Value->getKind() != MCExpr::SymbolRef) return; - const auto &RhsSym = cast<MCSymbolELF>( + auto &RhsSym = static_cast<const MCSymbolELF &>( static_cast<const MCSymbolRefExpr *>(Value)->getSymbol()); if (!(RhsSym.getOther() & ELF::STO_MIPS_MICROMIPS)) @@ -1034,12 +1034,14 @@ MCELFStreamer &MipsTargetELFStreamer::getStreamer() { void MipsTargetELFStreamer::emitGPRel32Value(const MCExpr *Value) { auto &S = getStreamer(); + S.ensureHeadroom(4); S.addFixup(Value, Mips::fixup_Mips_GPREL32); S.appendContents(4, 0); } void MipsTargetELFStreamer::emitGPRel64Value(const MCExpr *Value) { auto &S = getStreamer(); + S.ensureHeadroom(8); // fixup_Mips_GPREL32 desginates R_MIPS_GPREL32+R_MIPS_64 on MIPS64. 
S.addFixup(Value, Mips::fixup_Mips_GPREL32); S.appendContents(8, 0); @@ -1047,24 +1049,28 @@ void MipsTargetELFStreamer::emitGPRel64Value(const MCExpr *Value) { void MipsTargetELFStreamer::emitDTPRel32Value(const MCExpr *Value) { auto &S = getStreamer(); + S.ensureHeadroom(4); S.addFixup(Value, Mips::fixup_Mips_DTPREL32); S.appendContents(4, 0); } void MipsTargetELFStreamer::emitDTPRel64Value(const MCExpr *Value) { auto &S = getStreamer(); + S.ensureHeadroom(8); S.addFixup(Value, Mips::fixup_Mips_DTPREL64); S.appendContents(8, 0); } void MipsTargetELFStreamer::emitTPRel32Value(const MCExpr *Value) { auto &S = getStreamer(); + S.ensureHeadroom(4); S.addFixup(Value, Mips::fixup_Mips_TPREL32); S.appendContents(4, 0); } void MipsTargetELFStreamer::emitTPRel64Value(const MCExpr *Value) { auto &S = getStreamer(); + S.ensureHeadroom(8); S.addFixup(Value, Mips::fixup_Mips_TPREL64); S.appendContents(8, 0); } diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp index a2e48ab..4530fc6 100644 --- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp @@ -1052,8 +1052,7 @@ void MipsAsmPrinter::EmitFPCallStub( // __call_stub_fp_xxxx: // std::string x = "__call_stub_fp_" + std::string(Symbol); - MCSymbolELF *Stub = - cast<MCSymbolELF>(OutContext.getOrCreateSymbol(StringRef(x))); + MCSymbol *Stub = OutContext.getOrCreateSymbol(StringRef(x)); TS.emitDirectiveEnt(*Stub); MCSymbol *MType = OutContext.getOrCreateSymbol("__call_stub_fp_" + Twine(Symbol)); diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 2ae7520..aac611d 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -151,6 +151,8 @@ class OneUse2<SDPatternOperator operator> class fpimm_pos_inf<ValueType vt> : FPImmLeaf<vt, [{ return Imm.isPosInfinity(); }]>; +class zeroinitializer<ValueType vt> : + PatLeaf<(vt (bitconvert (!cast<ValueType>("i" # vt.Size) 0)))>; // Operands which can hold a Register or an Immediate. @@ -789,6 +791,23 @@ def UMAX16x2 : I16x2<"max.u", umax>; def SMIN16x2 : I16x2<"min.s", smin>; def UMIN16x2 : I16x2<"min.u", umin>; +let Predicates = [hasPTX<80>, hasSM<90>] in { + + def MIN_RELU_S32 : BasicNVPTXInst<(outs B32:$dst), (ins B32:$a, B32:$b), + "min.relu.s32", + [(set i32:$dst, (smax (smin i32:$a, i32:$b), 0))]>; + def MAX_RELU_S32 : BasicNVPTXInst<(outs B32:$dst), (ins B32:$a, B32:$b), + "max.relu.s32", + [(set i32:$dst, (smax (smax i32:$a, i32:$b), 0))]>; + def MIN_RELU_S16x2 : BasicNVPTXInst<(outs B32:$dst), (ins B32:$a, B32:$b), + "min.relu.s16x2", + [(set v2i16:$dst, (smax (smin v2i16:$a, v2i16:$b), + zeroinitializer<v2i16>))]>; + def MAX_RELU_S16x2 : BasicNVPTXInst<(outs B32:$dst), (ins B32:$a, B32:$b), + "max.relu.s16x2", + [(set v2i16:$dst, (smax (smax v2i16:$a, v2i16:$b), + zeroinitializer<v2i16>))]>; +} // // Wide multiplication @@ -1541,18 +1560,6 @@ def : Pat<(setcc (i16 (sext_inreg (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel (PRMT_B32rii i32:$b, 0, (to_sign_extend_selector $sel_b), PrmtNONE), (cond2cc $cc))>; -// A 16-bit comparison of truncated byte extracts can be be converted to 32-bit -// comparison because we know that the truncate is just trancating off zeros -// and that the most-significant byte is also zeros so the meaning of signed and -// unsigned comparisons will not be changed. 
-def : Pat<(setcc (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))), - (i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))), - cond:$cc), - (SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE), - (PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE), - (cond2cc $cc))>; - - def SDTDeclareArrayParam : SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>]>; def SDTDeclareScalarParam : @@ -2379,9 +2386,6 @@ def fpimm_any_zero : FPImmLeaf<fAny, [{ return Imm.isZero(); }]>; -def fpimm_positive_zero_v2f16 : PatFrag<(ops), (v2f16 (bitconvert (i32 0)))>; -def fpimm_positive_zero_v2bf16 : PatFrag<(ops), (v2bf16 (bitconvert (i32 0)))>; - // Perform substitution if fma only has one use, and also if instruction has // nnan instruction flag or if the TM has NoNaNsFPMath def NVPTX_fma_oneuse_and_nnan : PatFrag<(ops node:$a, node:$b, node:$c), @@ -2404,10 +2408,10 @@ class FMARELUInst<RegTyInfo t, bit allow_ftz, PatFrag zero_pat> let Predicates = [useFP16Math, hasPTX<70>, hasSM<80>] in { def FMARELU_F16 : FMARELUInst<F16RT, true, fpimm_any_zero>; - def FMARELU_F16X2 : FMARELUInst<F16X2RT, true, fpimm_positive_zero_v2f16>; + def FMARELU_F16X2 : FMARELUInst<F16X2RT, true, zeroinitializer<v2f16>>; } let Predicates = [hasBF16Math, hasPTX<70>, hasSM<80>] in { def FMARELU_BF16 : FMARELUInst<BF16RT, false, fpimm_any_zero>; - def FMARELU_BF16X2 : FMARELUInst<BF16X2RT, false, fpimm_positive_zero_v2bf16>; + def FMARELU_BF16X2 : FMARELUInst<BF16X2RT, false, zeroinitializer<v2bf16>>; } diff --git a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp index 58766b1..1fc475d 100644 --- a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp +++ b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp @@ -1756,7 +1756,7 @@ bool PPCAsmParser::parseDirectiveLocalEntry(SMLoc L) { if (getParser().parseIdentifier(Name)) return Error(L, "expected identifier in '.localentry' directive"); - MCSymbolELF *Sym = cast<MCSymbolELF>(getContext().getOrCreateSymbol(Name)); + auto *Sym = static_cast<MCSymbolELF *>(getContext().getOrCreateSymbol(Name)); const MCExpr *Expr; if (parseToken(AsmToken::Comma) || diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp index 0e8828f..04b886a 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp @@ -13,6 +13,7 @@ #include "llvm/BinaryFormat/MachO.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCMachObjectWriter.h" #include "llvm/MC/MCObjectWriter.h" @@ -93,8 +94,8 @@ public: MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override; void applyFixup(const MCFragment &, const MCFixup &Fixup, - const MCValue &Target, MutableArrayRef<char> Data, - uint64_t Value, bool IsResolved) override; + const MCValue &Target, uint8_t *Data, uint64_t Value, + bool IsResolved) override; bool shouldForceRelocation(const MCFixup &Fixup, const MCValue &Target) { // If there is a @ specifier, unless it is optimized out (e.g. constant @l), @@ -112,14 +113,15 @@ public: // to resolve the fixup directly. Emit a relocation and leave // resolution of the final target address to the linker. 
if (const auto *A = Target.getAddSym()) { - if (const auto *S = dyn_cast<MCSymbolELF>(A)) { + if (getContext().isELF()) { // The "other" values are stored in the last 6 bits of the second // byte. The traditional defines for STO values assume the full byte // and thus the shift to pack it. - unsigned Other = S->getOther() << 2; + unsigned Other = static_cast<const MCSymbolELF *>(A)->getOther() << 2; if ((Other & ELF::STO_PPC64_LOCAL_MASK) != 0) return true; - } else if (const auto *S = dyn_cast<MCSymbolXCOFF>(A)) { + } else if (getContext().isXCOFF()) { + auto *S = static_cast<const MCSymbolXCOFF *>(A); return !Target.isAbsolute() && S->isExternal() && S->getStorageClass() == XCOFF::C_WEAKEXT; } @@ -185,9 +187,8 @@ MCFixupKindInfo PPCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { } void PPCAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &TargetVal, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &TargetVal, uint8_t *Data, + uint64_t Value, bool IsResolved) { // In PPC64 ELFv1, .quad .TOC.@tocbase in the .opd section is expected to // reference the null symbol. auto Target = TargetVal; @@ -205,7 +206,6 @@ void PPCAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, if (!Value) return; // Doesn't change encoding. - unsigned Offset = Fixup.getOffset(); unsigned NumBytes = getFixupKindNumBytes(Kind); // For each byte of the fragment that the fixup touches, mask in the bits @@ -213,7 +213,7 @@ void PPCAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, // bitfields above. for (unsigned i = 0; i != NumBytes; ++i) { unsigned Idx = Endian == llvm::endianness::little ? i : (NumBytes - 1 - i); - Data[Offset + i] |= uint8_t((Value >> (Idx * 8)) & 0xff); + Data[i] |= uint8_t((Value >> (Idx * 8)) & 0xff); } } diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp index a5d3be4..329ad6e 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp @@ -86,8 +86,8 @@ unsigned PPCELFObjectWriter::getRelocType(const MCFixup &Fixup, case PPC::S_TPREL_HIGHEST: case PPC::S_TPREL_HIGHESTA: case PPC::S_TPREL_LO: - if (auto *SA = Target.getAddSym()) - cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS); + if (auto *SA = const_cast<MCSymbol *>(Target.getAddSym())) + static_cast<MCSymbolELF *>(SA)->setType(ELF::STT_TLS); break; default: break; @@ -499,7 +499,8 @@ bool PPCELFObjectWriter::needsRelocateWithSymbol(const MCValue &V, // The "other" values are stored in the last 6 bits of the second byte. // The traditional defines for STO values assume the full byte and thus // the shift to pack it. - unsigned Other = cast<MCSymbolELF>(V.getAddSym())->getOther() << 2; + unsigned Other = + static_cast<const MCSymbolELF *>(V.getAddSym())->getOther() << 2; return (Other & ELF::STO_PPC64_LOCAL_MASK) != 0; } diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp index 2dbc31f..132d5a4 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp @@ -65,7 +65,7 @@ void PPCELFStreamer::emitPrefixedInstruction(const MCInst &Inst, MCFragment *InstructionFragment = getCurrentFragment(); SMLoc InstLoc = Inst.getLoc(); // Check if there was a last label emitted. 
- if (LastLabel && !LastLabel->isUnset() && LastLabelLoc.isValid() && + if (LastLabel && LastLabel->isDefined() && LastLabelLoc.isValid() && InstLoc.isValid()) { const SourceMgr *SourceManager = getContext().getSourceManager(); unsigned InstLine = SourceManager->FindLineNumber(InstLoc); diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp index 3dad0e8..d856c3f 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp @@ -211,7 +211,7 @@ public: : PPCTargetStreamer(S), OS(OS) {} void emitTCEntry(const MCSymbol &S, PPCMCExpr::Specifier Kind) override { - if (const MCSymbolXCOFF *XSym = dyn_cast<MCSymbolXCOFF>(&S)) { + if (getContext().isXCOFF()) { MCSymbolXCOFF *TCSym = static_cast<const MCSectionXCOFF *>(Streamer.getCurrentSectionOnly()) ->getQualNameSymbol(); @@ -225,10 +225,10 @@ public: if (Kind == PPC::S_AIX_TLSGD || Kind == PPC::S_AIX_TLSGDM || Kind == PPC::S_AIX_TLSIE || Kind == PPC::S_AIX_TLSLE || Kind == PPC::S_AIX_TLSLD || Kind == PPC::S_AIX_TLSML) - OS << "\t.tc " << TCSym->getName() << "," << XSym->getName() << "@" + OS << "\t.tc " << TCSym->getName() << "," << S.getName() << "@" << getContext().getAsmInfo()->getSpecifierName(Kind) << '\n'; else - OS << "\t.tc " << TCSym->getName() << "," << XSym->getName() << '\n'; + OS << "\t.tc " << TCSym->getName() << "," << S.getName() << '\n'; if (TCSym->hasRename()) Streamer.emitXCOFFRenameDirective(TCSym, TCSym->getSymbolTableName()); @@ -308,7 +308,7 @@ public: } void emitAssignment(MCSymbol *S, const MCExpr *Value) override { - auto *Symbol = cast<MCSymbolELF>(S); + auto *Symbol = static_cast<MCSymbolELF *>(S); // When encoding an assignment to set symbol A to symbol B, also copy // the st_other bits encoding the local entry point offset. 
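A pattern that recurs across these MC hunks is replacing checked casts (cast<MCSymbolELF>, dyn_cast<MCSymbolXCOFF>) with plain static_casts guarded by the object-file format stored in MCContext, as the Mips and PPC asm-backend changes above already do with getContext().isELF() / isXCOFF(). A minimal sketch of that idiom follows; the helper name markTLS and the free-function packaging are illustrative assumptions, not part of the patch, and the stated rationale (the symbol subclass is implied by the output format) is inferred from the surrounding changes.

#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSymbolELF.h"

// Mark a symbol as TLS when the output format is ELF. The concrete MCSymbol
// subclass is implied by the object-file format recorded in MCContext, so no
// per-symbol RTTI check is needed; query the context once instead.
static void markTLS(llvm::MCContext &Ctx, llvm::MCSymbol *Sym) {
  if (Ctx.isELF())
    static_cast<llvm::MCSymbolELF *>(Sym)->setType(llvm::ELF::STT_TLS);
}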
@@ -335,7 +335,7 @@ private: auto *Ref = dyn_cast<const MCSymbolRefExpr>(S); if (!Ref) return false; - const auto &RhsSym = cast<MCSymbolELF>(Ref->getSymbol()); + auto &RhsSym = static_cast<const MCSymbolELF &>(Ref->getSymbol()); unsigned Other = D->getOther(); Other &= ~ELF::STO_PPC64_LOCAL_MASK; Other |= RhsSym.getOther() & ELF::STO_PPC64_LOCAL_MASK; diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index ce1d51a..2ab2c14 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -2155,7 +2155,8 @@ void PPCLinuxAsmPrinter::emitFunctionBodyStart() { PPCTargetStreamer *TS = static_cast<PPCTargetStreamer *>(OutStreamer->getTargetStreamer()); - TS->emitLocalEntry(cast<MCSymbolELF>(CurrentFnSym), LocalOffsetExp); + TS->emitLocalEntry(static_cast<MCSymbolELF *>(CurrentFnSym), + LocalOffsetExp); } else if (Subtarget->isUsingPCRelativeCalls()) { // When generating the entry point for a function we have a few scenarios // based on whether or not that function uses R2 and whether or not that @@ -2182,7 +2183,7 @@ void PPCLinuxAsmPrinter::emitFunctionBodyStart() { MF->hasInlineAsm() || (!PPCFI->usesTOCBasePtr() && UsesX2OrR2)) { PPCTargetStreamer *TS = static_cast<PPCTargetStreamer *>(OutStreamer->getTargetStreamer()); - TS->emitLocalEntry(cast<MCSymbolELF>(CurrentFnSym), + TS->emitLocalEntry(static_cast<MCSymbolELF *>(CurrentFnSym), MCConstantExpr::create(1, OutContext)); } } @@ -2766,7 +2767,7 @@ void PPCAIXAsmPrinter::emitGlobalVariableHelper(const GlobalVariable *GV) { if (GV->hasComdat()) report_fatal_error("COMDAT not yet supported by AIX."); - MCSymbolXCOFF *GVSym = cast<MCSymbolXCOFF>(getSymbol(GV)); + auto *GVSym = static_cast<MCSymbolXCOFF *>(getSymbol(GV)); if (GV->isDeclarationForLinker()) { emitLinkage(GV, GVSym); @@ -2859,7 +2860,7 @@ void PPCAIXAsmPrinter::emitFunctionDescriptor() { MCSectionSubPair Current = OutStreamer->getCurrentSection(); // Emit function descriptor. OutStreamer->switchSection( - cast<MCSymbolXCOFF>(CurrentFnDescSym)->getRepresentedCsect()); + static_cast<MCSymbolXCOFF *>(CurrentFnDescSym)->getRepresentedCsect()); // Emit aliasing label for function descriptor csect. 
for (const GlobalAlias *Alias : GOAliasMap[&MF->getFunction()]) @@ -2994,7 +2995,8 @@ void PPCAIXAsmPrinter::emitEndOfAsmFile(Module &M) { SmallString<128> Name; StringRef Prefix = "."; Name += Prefix; - Name += cast<MCSymbolXCOFF>(I.first.first)->getSymbolTableName(); + Name += static_cast<const MCSymbolXCOFF *>(I.first.first) + ->getSymbolTableName(); MCSymbol *S = OutContext.getOrCreateSymbol(Name); TCEntry = static_cast<MCSectionXCOFF *>( getObjFileLowering().getSectionForTOCEntry(S, TM)); @@ -3112,7 +3114,7 @@ bool PPCAIXAsmPrinter::doInitialization(Module &M) { setCsectAlignment(&G); std::optional<CodeModel::Model> OptionalCodeModel = G.getCodeModel(); if (OptionalCodeModel) - setOptionalCodeModel(cast<MCSymbolXCOFF>(getSymbol(&G)), + setOptionalCodeModel(static_cast<MCSymbolXCOFF *>(getSymbol(&G)), *OptionalCodeModel); } @@ -3139,7 +3141,7 @@ bool PPCAIXAsmPrinter::doInitialization(Module &M) { if (GVar) { std::optional<CodeModel::Model> OptionalCodeModel = GVar->getCodeModel(); if (OptionalCodeModel) - setOptionalCodeModel(cast<MCSymbolXCOFF>(getSymbol(&Alias)), + setOptionalCodeModel(static_cast<MCSymbolXCOFF *>(getSymbol(&Alias)), *OptionalCodeModel); } @@ -3190,8 +3192,8 @@ void PPCAIXAsmPrinter::emitInstruction(const MachineInstr *MI) { case PPC::BL_NOP: { const MachineOperand &MO = MI->getOperand(0); if (MO.isSymbol()) { - MCSymbolXCOFF *S = - cast<MCSymbolXCOFF>(OutContext.getOrCreateSymbol(MO.getSymbolName())); + auto *S = static_cast<MCSymbolXCOFF *>( + OutContext.getOrCreateSymbol(MO.getSymbolName())); ExtSymSDNodeSymbols.insert(S); } } break; diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index f179873..30b5fd6 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -1433,7 +1433,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, // With 32 condition bits, we don't need to sink (and duplicate) compares // aggressively in CodeGenPrep. 
if (Subtarget.useCRBits()) { - setHasMultipleConditionRegisters(); setJumpIsExpensive(); } @@ -5540,8 +5539,8 @@ static SDValue transformCallee(const SDValue &Callee, SelectionDAG &DAG, const auto getAIXFuncEntryPointSymbolSDNode = [&](const GlobalValue *GV) { const TargetMachine &TM = Subtarget.getTargetMachine(); const TargetLoweringObjectFile *TLOF = TM.getObjFileLowering(); - MCSymbolXCOFF *S = - cast<MCSymbolXCOFF>(TLOF->getFunctionEntryPointSymbol(GV, TM)); + auto *S = + static_cast<MCSymbolXCOFF *>(TLOF->getFunctionEntryPointSymbol(GV, TM)); MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); return DAG.getMCSymbol(S, PtrVT); @@ -19856,3 +19855,7 @@ Value *PPCTargetLowering::emitMaskedAtomicCmpXchgIntrinsic( return Builder.CreateOr( Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64"); } + +bool PPCTargetLowering::hasMultipleConditionRegisters(EVT VT) const { + return Subtarget.useCRBits(); +} diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index 124c711..9755f0e 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -1207,6 +1207,8 @@ namespace llvm { bool IsVarArg) const; bool supportsTailCallFor(const CallBase *CB) const; + bool hasMultipleConditionRegisters(EVT VT) const override; + private: struct ReuseLoadInfo { SDValue Ptr; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index 9538b20..95ec42f 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -327,19 +327,19 @@ bool RISCVAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) { bool RISCVAsmBackend::relaxDwarfLineAddr(MCFragment &F, bool &WasRelaxed) const { - MCContext &C = getContext(); - int64_t LineDelta = F.getDwarfLineDelta(); const MCExpr &AddrDelta = F.getDwarfAddrDelta(); - SmallVector<MCFixup, 1> Fixups; size_t OldSize = F.getVarSize(); int64_t Value; + // If the label difference can be resolved, use the default handling, which + // utilizes a shorter special opcode. + if (AddrDelta.evaluateAsAbsolute(Value, *Asm)) + return false; [[maybe_unused]] bool IsAbsolute = AddrDelta.evaluateKnownAbsolute(Value, *Asm); assert(IsAbsolute && "CFA with invalid expression"); - Fixups.clear(); SmallVector<char> Data; raw_svector_ostream OS(Data); @@ -349,33 +349,21 @@ bool RISCVAsmBackend::relaxDwarfLineAddr(MCFragment &F, encodeSLEB128(LineDelta, OS); } - unsigned Offset; - std::pair<MCFixupKind, MCFixupKind> Fixup; - // According to the DWARF specification, the `DW_LNS_fixed_advance_pc` opcode // takes a single unsigned half (unencoded) operand. The maximum encodable // value is therefore 65535. Set a conservative upper bound for relaxation. 
+ unsigned PCBytes; if (Value > 60000) { - unsigned PtrSize = C.getAsmInfo()->getCodePointerSize(); - - OS << uint8_t(dwarf::DW_LNS_extended_op); - encodeULEB128(PtrSize + 1, OS); - - OS << uint8_t(dwarf::DW_LNE_set_address); - Offset = OS.tell(); - assert((PtrSize == 4 || PtrSize == 8) && "Unexpected pointer size"); - Fixup = RISCV::getRelocPairForSize(PtrSize); - OS.write_zeros(PtrSize); + PCBytes = getContext().getAsmInfo()->getCodePointerSize(); + OS << uint8_t(dwarf::DW_LNS_extended_op) << uint8_t(PCBytes + 1) + << uint8_t(dwarf::DW_LNE_set_address); + OS.write_zeros(PCBytes); } else { + PCBytes = 2; OS << uint8_t(dwarf::DW_LNS_fixed_advance_pc); - Offset = OS.tell(); - Fixup = RISCV::getRelocPairForSize(2); support::endian::write<uint16_t>(OS, 0, llvm::endianness::little); } - - const MCBinaryExpr &MBE = cast<MCBinaryExpr>(AddrDelta); - Fixups.push_back(MCFixup::create(Offset, MBE.getLHS(), std::get<0>(Fixup))); - Fixups.push_back(MCFixup::create(Offset, MBE.getRHS(), std::get<1>(Fixup))); + auto Offset = OS.tell() - PCBytes; if (LineDelta == INT64_MAX) { OS << uint8_t(dwarf::DW_LNS_extended_op); @@ -386,7 +374,8 @@ bool RISCVAsmBackend::relaxDwarfLineAddr(MCFragment &F, } F.setVarContents(Data); - F.setVarFixups(Fixups); + F.setVarFixups({MCFixup::create(Offset, &AddrDelta, + MCFixup::getDataKindForSize(PCBytes))}); WasRelaxed = OldSize != Data.size(); return true; } @@ -754,7 +743,7 @@ std::optional<bool> RISCVAsmBackend::evaluateFixup(const MCFragment &, if (!AUIPCTarget.getAddSym()) return false; - const MCSymbolELF &SA = cast<MCSymbolELF>(*AUIPCTarget.getAddSym()); + auto &SA = static_cast<const MCSymbolELF &>(*AUIPCTarget.getAddSym()); if (SA.isUndefined()) return false; @@ -881,9 +870,8 @@ bool RISCVAsmBackend::addReloc(const MCFragment &F, const MCFixup &Fixup, } void RISCVAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { IsResolved = addReloc(F, Fixup, Target, Value, IsResolved); MCFixupKind Kind = Fixup.getKind(); if (mc::isRelocation(Kind)) @@ -898,15 +886,14 @@ void RISCVAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, // Shift the value into position. Value <<= Info.TargetOffset; - unsigned Offset = Fixup.getOffset(); unsigned NumBytes = alignTo(Info.TargetSize + Info.TargetOffset, 8) / 8; - - assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!"); + assert(Fixup.getOffset() + NumBytes <= F.getSize() && + "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the // bits from the fixup value. 
for (unsigned i = 0; i != NumBytes; ++i) { - Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); + Data[i] |= uint8_t((Value >> (i * 8)) & 0xff); } } diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h index d97d632..adec1ec 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h @@ -46,8 +46,7 @@ public: void maybeAddVendorReloc(const MCFragment &, const MCFixup &); void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; std::unique_ptr<MCObjectTargetWriter> createObjectTargetWriter() const override; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp index 9bf7896..2885e3c 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp @@ -55,8 +55,8 @@ unsigned RISCVELFObjectWriter::getRelocType(const MCFixup &Fixup, case ELF::R_RISCV_TLS_GOT_HI20: case ELF::R_RISCV_TLS_GD_HI20: case ELF::R_RISCV_TLSDESC_HI20: - if (auto *SA = Target.getAddSym()) - cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS); + if (auto *SA = const_cast<MCSymbol *>(Target.getAddSym())) + static_cast<MCSymbolELF *>(SA)->setType(ELF::STT_TLS); break; case ELF::R_RISCV_PLT32: case ELF::R_RISCV_GOT32_PCREL: diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp index c654fd2b..543c4c5 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp @@ -117,7 +117,7 @@ void RISCVTargetELFStreamer::reset() { void RISCVTargetELFStreamer::emitDirectiveVariantCC(MCSymbol &Symbol) { getStreamer().getAssembler().registerSymbol(Symbol); - cast<MCSymbolELF>(Symbol).setOther(ELF::STO_RISCV_VARIANT_CC); + static_cast<MCSymbolELF &>(Symbol).setOther(ELF::STO_RISCV_VARIANT_CC); } void RISCVELFStreamer::reset() { @@ -142,7 +142,8 @@ void RISCVELFStreamer::emitInstructionsMappingSymbol() { } void RISCVELFStreamer::emitMappingSymbol(StringRef Name) { - auto *Symbol = cast<MCSymbolELF>(getContext().createLocalSymbol(Name)); + auto *Symbol = + static_cast<MCSymbolELF *>(getContext().createLocalSymbol(Name)); emitLabel(Symbol); Symbol->setType(ELF::STT_NOTYPE); Symbol->setBinding(ELF::STB_LOCAL); diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h index f816561c..98c8738 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h @@ -68,27 +68,6 @@ enum Fixups { fixup_riscv_invalid, NumTargetFixupKinds = fixup_riscv_invalid - FirstTargetFixupKind }; - -static inline std::pair<MCFixupKind, MCFixupKind> -getRelocPairForSize(unsigned Size) { - switch (Size) { - default: - llvm_unreachable("unsupported fixup size"); - case 1: - return std::make_pair(FirstLiteralRelocationKind + ELF::R_RISCV_ADD8, - FirstLiteralRelocationKind + ELF::R_RISCV_SUB8); - case 2: - return std::make_pair(FirstLiteralRelocationKind + ELF::R_RISCV_ADD16, - FirstLiteralRelocationKind + ELF::R_RISCV_SUB16); - case 4: - return std::make_pair(FirstLiteralRelocationKind + ELF::R_RISCV_ADD32, - FirstLiteralRelocationKind + ELF::R_RISCV_SUB32); - case 8: - return 
std::make_pair(FirstLiteralRelocationKind + ELF::R_RISCV_ADD64, - FirstLiteralRelocationKind + ELF::R_RISCV_SUB64); - } -} - } // end namespace llvm::RISCV #endif diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp index 3655861..f70837e 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp @@ -68,36 +68,30 @@ void RISCVTargetStreamer::emitNoteGnuPropertySection( const Triple &Triple = Ctx.getTargetTriple(); Align NoteAlign; + uint64_t DescSize; if (Triple.isArch64Bit()) { NoteAlign = Align(8); + DescSize = 16; } else { assert(Triple.isArch32Bit()); NoteAlign = Align(4); + DescSize = 12; } assert(Ctx.getObjectFileType() == MCContext::Environment::IsELF); MCSection *const NoteSection = Ctx.getELFSection(".note.gnu.property", ELF::SHT_NOTE, ELF::SHF_ALLOC); - NoteSection->setAlignment(NoteAlign); OutStreamer.pushSection(); OutStreamer.switchSection(NoteSection); // Emit the note header - OutStreamer.emitIntValue(4, 4); // n_namsz - - MCSymbol *const NDescBeginSym = Ctx.createTempSymbol(); - MCSymbol *const NDescEndSym = Ctx.createTempSymbol(); - const MCExpr *const NDescSzExpr = - MCBinaryExpr::createSub(MCSymbolRefExpr::create(NDescEndSym, Ctx), - MCSymbolRefExpr::create(NDescBeginSym, Ctx), Ctx); - - OutStreamer.emitValue(NDescSzExpr, 4); // n_descsz + OutStreamer.emitValueToAlignment(NoteAlign); + OutStreamer.emitIntValue(4, 4); // n_namsz + OutStreamer.emitIntValue(DescSize, 4); // n_descsz OutStreamer.emitIntValue(ELF::NT_GNU_PROPERTY_TYPE_0, 4); // n_type OutStreamer.emitBytes(StringRef("GNU", 4)); // n_name // Emit n_desc field - OutStreamer.emitLabel(NDescBeginSym); - OutStreamer.emitValueToAlignment(NoteAlign); // Emit the feature_1_and property OutStreamer.emitIntValue(ELF::GNU_PROPERTY_RISCV_FEATURE_1_AND, 4); // pr_type @@ -105,7 +99,6 @@ void RISCVTargetStreamer::emitNoteGnuPropertySection( OutStreamer.emitIntValue(Feature1And, 4); // pr_data OutStreamer.emitValueToAlignment(NoteAlign); // pr_padding - OutStreamer.emitLabel(NDescEndSym); OutStreamer.popSection(); } diff --git a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp index 82c0d8d..80a48c5 100644 --- a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp @@ -167,9 +167,8 @@ static std::pair<Value *, Value *> matchStridedStart(Value *Start, default: llvm_unreachable("Unexpected opcode"); case Instruction::Or: - // TODO: We'd be better off creating disjoint or here, but we don't yet - // have an IRBuilder API for that. 
- [[fallthrough]]; + Start = Builder.CreateOr(Start, Splat, "", /*IsDisjoint=*/true); + break; case Instruction::Add: Start = Builder.CreateAdd(Start, Splat); break; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index adbfbeb..0077ecf 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -927,6 +927,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, {ISD::VP_LOAD, ISD::VP_STORE, ISD::EXPERIMENTAL_VP_STRIDED_LOAD, ISD::EXPERIMENTAL_VP_STRIDED_STORE, ISD::VP_GATHER, ISD::VP_SCATTER}, VT, Custom); + setOperationAction(ISD::VP_LOAD_FF, VT, Custom); setOperationAction({ISD::CONCAT_VECTORS, ISD::INSERT_SUBVECTOR, ISD::EXTRACT_SUBVECTOR, ISD::SCALAR_TO_VECTOR}, @@ -1105,6 +1106,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, {ISD::VP_LOAD, ISD::VP_STORE, ISD::EXPERIMENTAL_VP_STRIDED_LOAD, ISD::EXPERIMENTAL_VP_STRIDED_STORE, ISD::VP_GATHER, ISD::VP_SCATTER}, VT, Custom); + setOperationAction(ISD::VP_LOAD_FF, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SELECT_CC, VT, Expand); @@ -1181,6 +1183,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::EXPERIMENTAL_VP_STRIDED_STORE, ISD::VP_GATHER, ISD::VP_SCATTER}, VT, Custom); + setOperationAction(ISD::VP_LOAD_FF, VT, Custom); setOperationAction(ISD::FNEG, VT, Expand); setOperationAction(ISD::FABS, VT, Expand); @@ -1352,6 +1355,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::EXPERIMENTAL_VP_STRIDED_STORE, ISD::VP_GATHER, ISD::VP_SCATTER}, VT, Custom); + setOperationAction(ISD::VP_LOAD_FF, VT, Custom); setOperationAction({ISD::ADD, ISD::MUL, ISD::SUB, ISD::AND, ISD::OR, ISD::XOR, ISD::SDIV, ISD::SREM, ISD::UDIV, @@ -1442,6 +1446,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::VP_SCATTER, ISD::EXPERIMENTAL_VP_STRIDED_LOAD, ISD::EXPERIMENTAL_VP_STRIDED_STORE}, VT, Custom); + setOperationAction(ISD::VP_LOAD_FF, VT, Custom); setOperationAction({ISD::FP_ROUND, ISD::FP_EXTEND}, VT, Custom); setOperationAction({ISD::STRICT_FP_ROUND, ISD::STRICT_FP_EXTEND}, VT, @@ -7012,6 +7017,7 @@ static unsigned getRISCVVLOp(SDValue Op) { OP_CASE(FDIV) OP_CASE(FNEG) OP_CASE(FABS) + OP_CASE(FCOPYSIGN) OP_CASE(FSQRT) OP_CASE(SMIN) OP_CASE(SMAX) @@ -7079,6 +7085,15 @@ static unsigned getRISCVVLOp(SDValue Op) { if (Op.getSimpleValueType().getVectorElementType() == MVT::i1) return RISCVISD::VMXOR_VL; return RISCVISD::XOR_VL; + case ISD::ANY_EXTEND: + case ISD::ZERO_EXTEND: + return RISCVISD::VZEXT_VL; + case ISD::SIGN_EXTEND: + return RISCVISD::VSEXT_VL; + case ISD::SETCC: + return RISCVISD::SETCC_VL; + case ISD::VSELECT: + return RISCVISD::VMERGE_VL; case ISD::VP_SELECT: case ISD::VP_MERGE: return RISCVISD::VMERGE_VL; @@ -7419,12 +7434,16 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, if (Op.getOperand(0).getValueType().isVector() && Op.getOperand(0).getValueType().getVectorElementType() == MVT::i1) return lowerVectorMaskExt(Op, DAG, /*ExtVal*/ 1); - return lowerFixedLengthVectorExtendToRVV(Op, DAG, RISCVISD::VZEXT_VL); + if (Op.getValueType().isScalableVector()) + return Op; + return lowerToScalableOp(Op, DAG); case ISD::SIGN_EXTEND: if (Op.getOperand(0).getValueType().isVector() && Op.getOperand(0).getValueType().getVectorElementType() == MVT::i1) return lowerVectorMaskExt(Op, DAG, /*ExtVal*/ -1); - return lowerFixedLengthVectorExtendToRVV(Op, DAG, RISCVISD::VSEXT_VL); + if 
(Op.getValueType().isScalableVector()) + return Op; + return lowerToScalableOp(Op, DAG); case ISD::SPLAT_VECTOR_PARTS: return lowerSPLAT_VECTOR_PARTS(Op, DAG); case ISD::INSERT_VECTOR_ELT: @@ -8103,6 +8122,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, case ISD::MLOAD: case ISD::VP_LOAD: return lowerMaskedLoad(Op, DAG); + case ISD::VP_LOAD_FF: + return lowerLoadFF(Op, DAG); case ISD::MSTORE: case ISD::VP_STORE: return lowerMaskedStore(Op, DAG); @@ -8166,7 +8187,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, if (isPromotedOpNeedingSplit(Op.getOperand(0), Subtarget)) return SplitVectorOp(Op, DAG); - return lowerFixedLengthVectorSetccToRVV(Op, DAG); + return lowerToScalableOp(Op, DAG); } case ISD::ADD: case ISD::SUB: @@ -8182,6 +8203,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, case ISD::UREM: case ISD::BSWAP: case ISD::CTPOP: + case ISD::VSELECT: return lowerToScalableOp(Op, DAG); case ISD::SHL: case ISD::SRA: @@ -8250,14 +8272,12 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, return lowerToScalableOp(Op, DAG); assert(Op.getOpcode() != ISD::CTTZ); return lowerCTLZ_CTTZ_ZERO_UNDEF(Op, DAG); - case ISD::VSELECT: - return lowerFixedLengthVectorSelectToRVV(Op, DAG); case ISD::FCOPYSIGN: if (Op.getValueType() == MVT::f16 || Op.getValueType() == MVT::bf16) return lowerFCOPYSIGN(Op, DAG, Subtarget); if (isPromotedOpNeedingSplit(Op, Subtarget)) return SplitVectorOp(Op, DAG); - return lowerFixedLengthVectorFCOPYSIGNToRVV(Op, DAG); + return lowerToScalableOp(Op, DAG); case ISD::STRICT_FADD: case ISD::STRICT_FSUB: case ISD::STRICT_FMUL: @@ -9694,33 +9714,6 @@ SDValue RISCVTargetLowering::lowerVectorMaskExt(SDValue Op, SelectionDAG &DAG, return convertFromScalableVector(VecVT, Select, DAG, Subtarget); } -SDValue RISCVTargetLowering::lowerFixedLengthVectorExtendToRVV( - SDValue Op, SelectionDAG &DAG, unsigned ExtendOpc) const { - MVT ExtVT = Op.getSimpleValueType(); - // Only custom-lower extensions from fixed-length vector types. - if (!ExtVT.isFixedLengthVector()) - return Op; - MVT VT = Op.getOperand(0).getSimpleValueType(); - // Grab the canonical container type for the extended type. Infer the smaller - // type from that to ensure the same number of vector elements, as we know - // the LMUL will be sufficient to hold the smaller type. - MVT ContainerExtVT = getContainerForFixedLengthVector(ExtVT); - // Get the extended container type manually to ensure the same number of - // vector elements between source and dest. 
- MVT ContainerVT = MVT::getVectorVT(VT.getVectorElementType(), - ContainerExtVT.getVectorElementCount()); - - SDValue Op1 = - convertToScalableVector(ContainerVT, Op.getOperand(0), DAG, Subtarget); - - SDLoc DL(Op); - auto [Mask, VL] = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget); - - SDValue Ext = DAG.getNode(ExtendOpc, DL, ContainerExtVT, Op1, Mask, VL); - - return convertFromScalableVector(ExtVT, Ext, DAG, Subtarget); -} - // Custom-lower truncations from vectors to mask vectors by using a mask and a // setcc operation: // (vXi1 = trunc vXiN vec) -> (vXi1 = setcc (and vec, 1), 0, ne) @@ -12739,6 +12732,51 @@ SDValue RISCVTargetLowering::lowerMaskedLoad(SDValue Op, return DAG.getMergeValues({Result, Chain}, DL); } +SDValue RISCVTargetLowering::lowerLoadFF(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + MVT VT = Op->getSimpleValueType(0); + + const auto *VPLoadFF = cast<VPLoadFFSDNode>(Op); + EVT MemVT = VPLoadFF->getMemoryVT(); + MachineMemOperand *MMO = VPLoadFF->getMemOperand(); + SDValue Chain = VPLoadFF->getChain(); + SDValue BasePtr = VPLoadFF->getBasePtr(); + + SDValue Mask = VPLoadFF->getMask(); + SDValue VL = VPLoadFF->getVectorLength(); + + MVT XLenVT = Subtarget.getXLenVT(); + + MVT ContainerVT = VT; + if (VT.isFixedLengthVector()) { + ContainerVT = getContainerForFixedLengthVector(VT); + MVT MaskVT = getMaskTypeFor(ContainerVT); + Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget); + } + + unsigned IntID = Intrinsic::riscv_vleff_mask; + SDValue Ops[] = { + Chain, + DAG.getTargetConstant(IntID, DL, XLenVT), + DAG.getUNDEF(ContainerVT), + BasePtr, + Mask, + VL, + DAG.getTargetConstant(RISCVVType::TAIL_AGNOSTIC, DL, XLenVT)}; + + SDVTList VTs = DAG.getVTList({ContainerVT, Op->getValueType(1), MVT::Other}); + + SDValue Result = + DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops, MemVT, MMO); + SDValue OutVL = Result.getValue(1); + Chain = Result.getValue(2); + + if (VT.isFixedLengthVector()) + Result = convertFromScalableVector(VT, Result, DAG, Subtarget); + + return DAG.getMergeValues({Result, OutVL, Chain}, DL); +} + SDValue RISCVTargetLowering::lowerMaskedStore(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); @@ -12834,31 +12872,6 @@ SDValue RISCVTargetLowering::lowerVectorCompress(SDValue Op, return Res; } -SDValue -RISCVTargetLowering::lowerFixedLengthVectorSetccToRVV(SDValue Op, - SelectionDAG &DAG) const { - MVT InVT = Op.getOperand(0).getSimpleValueType(); - MVT ContainerVT = getContainerForFixedLengthVector(InVT); - - MVT VT = Op.getSimpleValueType(); - - SDValue Op1 = - convertToScalableVector(ContainerVT, Op.getOperand(0), DAG, Subtarget); - SDValue Op2 = - convertToScalableVector(ContainerVT, Op.getOperand(1), DAG, Subtarget); - - SDLoc DL(Op); - auto [Mask, VL] = getDefaultVLOps(VT.getVectorNumElements(), ContainerVT, DL, - DAG, Subtarget); - MVT MaskVT = getMaskTypeFor(ContainerVT); - - SDValue Cmp = - DAG.getNode(RISCVISD::SETCC_VL, DL, MaskVT, - {Op1, Op2, Op.getOperand(2), DAG.getUNDEF(MaskVT), Mask, VL}); - - return convertFromScalableVector(VT, Cmp, DAG, Subtarget); -} - SDValue RISCVTargetLowering::lowerVectorStrictFSetcc(SDValue Op, SelectionDAG &DAG) const { unsigned Opc = Op.getOpcode(); @@ -12985,51 +12998,6 @@ SDValue RISCVTargetLowering::lowerABS(SDValue Op, SelectionDAG &DAG) const { return Max; } -SDValue RISCVTargetLowering::lowerFixedLengthVectorFCOPYSIGNToRVV( - SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - MVT VT = Op.getSimpleValueType(); - SDValue Mag = Op.getOperand(0); - SDValue Sign = 
Op.getOperand(1); - assert(Mag.getValueType() == Sign.getValueType() && - "Can only handle COPYSIGN with matching types."); - - MVT ContainerVT = getContainerForFixedLengthVector(VT); - Mag = convertToScalableVector(ContainerVT, Mag, DAG, Subtarget); - Sign = convertToScalableVector(ContainerVT, Sign, DAG, Subtarget); - - auto [Mask, VL] = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget); - - SDValue CopySign = DAG.getNode(RISCVISD::FCOPYSIGN_VL, DL, ContainerVT, Mag, - Sign, DAG.getUNDEF(ContainerVT), Mask, VL); - - return convertFromScalableVector(VT, CopySign, DAG, Subtarget); -} - -SDValue RISCVTargetLowering::lowerFixedLengthVectorSelectToRVV( - SDValue Op, SelectionDAG &DAG) const { - MVT VT = Op.getSimpleValueType(); - MVT ContainerVT = getContainerForFixedLengthVector(VT); - - MVT I1ContainerVT = - MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount()); - - SDValue CC = - convertToScalableVector(I1ContainerVT, Op.getOperand(0), DAG, Subtarget); - SDValue Op1 = - convertToScalableVector(ContainerVT, Op.getOperand(1), DAG, Subtarget); - SDValue Op2 = - convertToScalableVector(ContainerVT, Op.getOperand(2), DAG, Subtarget); - - SDLoc DL(Op); - SDValue VL = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).second; - - SDValue Select = DAG.getNode(RISCVISD::VMERGE_VL, DL, ContainerVT, CC, Op1, - Op2, DAG.getUNDEF(ContainerVT), VL); - - return convertFromScalableVector(VT, Select, DAG, Subtarget); -} - SDValue RISCVTargetLowering::lowerToScalableOp(SDValue Op, SelectionDAG &DAG) const { const auto &TSInfo = @@ -13056,7 +13024,9 @@ SDValue RISCVTargetLowering::lowerToScalableOp(SDValue Op, // "cast" fixed length vector to a scalable vector. assert(useRVVForFixedLengthVectorVT(V.getSimpleValueType()) && "Only fixed length vectors are supported!"); - Ops.push_back(convertToScalableVector(ContainerVT, V, DAG, Subtarget)); + MVT VContainerVT = ContainerVT.changeVectorElementType( + V.getSimpleValueType().getVectorElementType()); + Ops.push_back(convertToScalableVector(VContainerVT, V, DAG, Subtarget)); } SDLoc DL(Op); @@ -21478,11 +21448,10 @@ bool RISCVTargetLowering::canCreateUndefOrPoisonForTargetNode( // TODO: Add more target nodes. switch (Op.getOpcode()) { case RISCVISD::SELECT_CC: - // Integer select_cc cannot create poison. - // TODO: What are the FP poison semantics? - // TODO: This instruction blocks poison from the unselected operand, can - // we do anything with that? - return !Op.getValueType().isInteger(); + // Integer comparisons cannot create poison. 
+ assert(Op.getOperand(0).getValueType().isInteger() && + "RISCVISD::SELECT_CC only compares integers"); + return false; } return TargetLowering::canCreateUndefOrPoisonForTargetNode( Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth); @@ -22550,6 +22519,7 @@ SDValue RISCVTargetLowering::LowerFormalArguments( constexpr StringLiteral SupportedInterruptKinds[] = { "machine", "supervisor", + "rnmi", "qci-nest", "qci-nonest", "SiFive-CLIC-preemptible", @@ -22567,6 +22537,8 @@ SDValue RISCVTargetLowering::LowerFormalArguments( reportFatalUsageError( "'SiFive-CLIC-*' interrupt kinds require XSfmclic extension"); + if (Kind == "rnmi" && !Subtarget.hasStdExtSmrnmi()) + reportFatalUsageError("'rnmi' interrupt kind requires Srnmi extension"); const TargetFrameLowering *TFI = Subtarget.getFrameLowering(); if (Kind.starts_with("SiFive-CLIC-preemptible") && TFI->hasFP(MF)) reportFatalUsageError("'SiFive-CLIC-preemptible' interrupt kinds cannot " @@ -23212,7 +23184,11 @@ RISCVTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, if (Kind == "supervisor") RetOpc = RISCVISD::SRET_GLUE; - else if (Kind == "qci-nest" || Kind == "qci-nonest") { + else if (Kind == "rnmi") { + assert(STI.hasFeature(RISCV::FeatureStdExtSmrnmi) && + "Need Smrnmi extension for rnmi"); + RetOpc = RISCVISD::MNRET_GLUE; + } else if (Kind == "qci-nest" || Kind == "qci-nonest") { assert(STI.hasFeature(RISCV::FeatureVendorXqciint) && "Need Xqciint for qci-(no)nest"); RetOpc = RISCVISD::QC_C_MILEAVERET_GLUE; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index ca70c46..433b8be 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -526,6 +526,7 @@ private: SDValue lowerVECTOR_SPLICE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerABS(SDValue Op, SelectionDAG &DAG) const; SDValue lowerMaskedLoad(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerLoadFF(SDValue Op, SelectionDAG &DAG) const; SDValue lowerMaskedStore(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVectorCompress(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFixedLengthVectorFCOPYSIGNToRVV(SDValue Op, @@ -534,9 +535,6 @@ private: SDValue lowerMaskedScatter(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFixedLengthVectorLoadToRVV(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFixedLengthVectorStoreToRVV(SDValue Op, SelectionDAG &DAG) const; - SDValue lowerFixedLengthVectorSetccToRVV(SDValue Op, SelectionDAG &DAG) const; - SDValue lowerFixedLengthVectorSelectToRVV(SDValue Op, - SelectionDAG &DAG) const; SDValue lowerToScalableOp(SDValue Op, SelectionDAG &DAG) const; SDValue LowerIS_FPCLASS(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVPOp(SDValue Op, SelectionDAG &DAG) const; @@ -551,8 +549,6 @@ private: SDValue lowerVPStridedLoad(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVPStridedStore(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVPCttzElements(SDValue Op, SelectionDAG &DAG) const; - SDValue lowerFixedLengthVectorExtendToRVV(SDValue Op, SelectionDAG &DAG, - unsigned ExtendOpc) const; SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const; SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const; SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 6536078..8bd3830 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -75,6 +75,8 @@ def 
riscv_sret_glue : RVSDNode<"SRET_GLUE", SDTNone, [SDNPHasChain, SDNPOptInGlue]>; def riscv_mret_glue : RVSDNode<"MRET_GLUE", SDTNone, [SDNPHasChain, SDNPOptInGlue]>; +def riscv_mnret_glue : RVSDNode<"MNRET_GLUE", SDTNone, + [SDNPHasChain, SDNPOptInGlue]>; def riscv_mileaveret_glue : RVSDNode<"QC_C_MILEAVERET_GLUE", SDTNone, [SDNPHasChain, SDNPOptInGlue]>; @@ -935,7 +937,6 @@ def MRET : Priv<"mret", 0b0011000>, Sched<[]> { let rs1 = 0; let rs2 = 0b00010; } -} // isBarrier = 1, isReturn = 1, isTerminator = 1 let Predicates = [HasStdExtSmrnmi] in { def MNRET : Priv<"mnret", 0b0111000>, Sched<[]> { @@ -944,6 +945,8 @@ def MNRET : Priv<"mnret", 0b0111000>, Sched<[]> { let rs2 = 0b00010; } }// Predicates = [HasStdExtSmrnmi] +} // isBarrier = 1, isReturn = 1, isTerminator = 1 + def WFI : Priv<"wfi", 0b0001000>, Sched<[]> { let rd = 0; @@ -1801,6 +1804,8 @@ def : Pat<(riscv_call texternalsym:$func), (PseudoCALL texternalsym:$func)>; def : Pat<(riscv_sret_glue), (SRET)>; def : Pat<(riscv_mret_glue), (MRET)>; +let Predicates = [HasStdExtSmrnmi] in +def : Pat<(riscv_mnret_glue), (MNRET)>; let isCall = 1, Defs = [X1] in { let Predicates = [NoStdExtZicfilp] in diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index 5265613..2c64b0c 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -14,6 +14,14 @@ // Operand and SDNode transformation definitions. //===----------------------------------------------------------------------===// +def SDT_SetMultiple : SDTypeProfile<0, 4, [SDTCisSameAs<0, 1>, + SDTCisSameAs<1, 3>, + SDTCisPtrTy<2>, + SDTCisVT<3, XLenVT>]>; + +def qc_setwmi : RVSDNode<"QC_SETWMI", SDT_SetMultiple, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + def uimm5nonzero : RISCVOp<XLenVT>, ImmLeaf<XLenVT, [{return (Imm != 0) && isUInt<5>(Imm);}]> { let ParserMatchClass = UImmAsmOperand<5, "NonZero">; @@ -27,6 +35,8 @@ def uimm5nonzero : RISCVOp<XLenVT>, }]; } +def tuimm5nonzero : TImmLeaf<XLenVT, [{return (Imm != 0) && isUInt<5>(Imm);}]>; + def uimm5gt3 : RISCVOp<XLenVT>, ImmLeaf<XLenVT, [{return (Imm > 3) && isUInt<5>(Imm);}]> { let ParserMatchClass = UImmAsmOperand<5, "GT3">; @@ -92,6 +102,8 @@ def uimm5slist : RISCVOp<XLenVT>, ImmLeaf<XLenVT, }]; } +def tuimm7_lsb00 : TImmLeaf<XLenVT,[{return isShiftedUInt<5, 2>(Imm);}]>; + def uimm10 : RISCVUImmLeafOp<10>; def uimm11 : RISCVUImmLeafOp<11>; @@ -457,6 +469,13 @@ class QCIRVInstRR<bits<5> funct5, DAGOperand InTyRs1, string opcodestr> : RVInstR<{0b00, funct5}, 0b011, OPC_CUSTOM_0, (outs GPRNoX0:$rd), (ins InTyRs1:$rs1, GPRNoX0:$rs2), opcodestr, "$rd, $rs1, $rs2">; +class QCIRVInstRRTied<bits<5> funct5, DAGOperand InTyRs1, string opcodestr> + : RVInstR<{0b00, funct5}, 0b011, OPC_CUSTOM_0, (outs GPRNoX0:$rd_wb), + (ins GPRNoX0:$rd, InTyRs1:$rs1, GPRNoX0:$rs2), opcodestr, + "$rd, $rs1, $rs2"> { + let Constraints = "$rd = $rd_wb"; +} + class QCIBitManipRII<bits<3> funct3, bits<2> funct2, DAGOperand InTyRs1, string opcodestr> : RVInstIBase<funct3, OPC_CUSTOM_0, (outs GPRNoX0:$rd), @@ -470,11 +489,26 @@ class QCIBitManipRII<bits<3> funct3, bits<2> funct2, let Inst{24-20} = shamt; } +class QCIBitManipRIITied<bits<3> funct3, bits<2> funct2, + DAGOperand InTyRs1, string opcodestr> + : RVInstIBase<funct3, OPC_CUSTOM_0, (outs GPRNoX0:$rd_wb), (ins GPRNoX0:$rd, + InTyRs1:$rs1, uimm5_plus1:$width, uimm5:$shamt), + opcodestr, "$rd, $rs1, $width, $shamt"> { + let Constraints = "$rd = $rd_wb"; + bits<5> shamt; + bits<5> width; + + let Inst{31-30} = 
funct2; + let Inst{29-25} = width; + let Inst{24-20} = shamt; +} + class QCIRVInstRI<bits<1> funct1, DAGOperand InTyImm11, string opcodestr> - : RVInstIBase<0b000, OPC_CUSTOM_0, (outs GPRNoX0:$rd), - (ins GPRNoX0:$rs1, InTyImm11:$imm11), opcodestr, + : RVInstIBase<0b000, OPC_CUSTOM_0, (outs GPRNoX0:$rd_wb), + (ins GPRNoX0:$rd, GPRNoX0:$rs1, InTyImm11:$imm11), opcodestr, "$rd, $rs1, $imm11"> { + let Constraints = "$rd = $rd_wb"; bits<11> imm11; let Inst{31-31} = funct1; @@ -858,12 +892,12 @@ let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { let Inst{29-25} = width; let Inst{24-20} = shamt; } - def QC_INSB : QCIBitManipRII<0b001, 0b01, GPR, "qc.insb">; - def QC_INSBH : QCIBitManipRII<0b001, 0b10, GPR, "qc.insbh">; - def QC_INSBR : QCIRVInstRR<0b00000, GPR, "qc.insbr">; - def QC_INSBHR : QCIRVInstRR<0b00001, GPR, "qc.insbhr">; - def QC_INSBPR : QCIRVInstRR<0b00010, GPR, "qc.insbpr">; - def QC_INSBPRH : QCIRVInstRR<0b00011, GPR, "qc.insbprh">; + def QC_INSB : QCIBitManipRIITied<0b001, 0b01, GPR, "qc.insb">; + def QC_INSBH : QCIBitManipRIITied<0b001, 0b10, GPR, "qc.insbh">; + def QC_INSBR : QCIRVInstRRTied<0b00000, GPR, "qc.insbr">; + def QC_INSBHR : QCIRVInstRRTied<0b00001, GPR, "qc.insbhr">; + def QC_INSBPR : QCIRVInstRRTied<0b00010, GPR, "qc.insbpr">; + def QC_INSBPRH : QCIRVInstRRTied<0b00011, GPR, "qc.insbprh">; def QC_EXTU : QCIBitManipRII<0b010, 0b00, GPRNoX0, "qc.extu">; def QC_EXTDU : QCIBitManipRII<0b010, 0b10, GPRNoX31, "qc.extdu">; def QC_EXTDUR : QCIRVInstRR<0b00100, GPRNoX31, "qc.extdur">; @@ -1566,6 +1600,11 @@ def : QCISELECTIICCPat <SETEQ, QC_SELECTIIEQ>; def : QCISELECTIICCPat <SETNE, QC_SELECTIINE>; } // Predicates = [HasVendorXqcics, IsRV32] +let Predicates = [HasVendorXqcilsm, IsRV32] in { +def : Pat<(qc_setwmi GPR:$rs3, GPR:$rs1, tuimm5nonzero:$uimm5, tuimm7_lsb00:$uimm7), + (QC_SETWMI GPR:$rs3, GPR:$rs1, tuimm5nonzero:$uimm5, tuimm7_lsb00:$uimm7)>; +} // Predicates = [HasVendorXqcilsm, IsRV32] + //===----------------------------------------------------------------------===/i // Compress Instruction tablegen backend. //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td index d2a6514..04ffb05 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td @@ -641,13 +641,15 @@ def : Pat<(binop_allhusers<or> (shl GPR:$rs2, (XLenVT 8)), let Predicates = [HasStdExtZbkb, IsRV32] in { def : Pat<(i32 (or (zexti16 (i32 GPR:$rs1)), (shl GPR:$rs2, (i32 16)))), (PACK GPR:$rs1, GPR:$rs2)>; -def : Pat<(or (or - (shl (zexti8 (XLenVT GPR:$op1rs2)), (XLenVT 24)), + +// Match a pattern of 2 bytes being inserted into bits [31:16], with bits +// bits [15:0] coming from a zero extended value. We can use pack with packh for +// bits [31:16]. If bits [15:0] can also be a packh, it can be matched +// separately. 
+def : Pat<(or (or (shl (zexti8 (XLenVT GPR:$op1rs2)), (XLenVT 24)), (shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))), - (or - (shl (zexti8 (XLenVT GPR:$op0rs2)), (XLenVT 8)), - (zexti8 (XLenVT GPR:$op0rs1)))), - (PACK (XLenVT (PACKH GPR:$op0rs1, GPR:$op0rs2)), + (zexti16 (XLenVT GPR:$rs1))), + (PACK (XLenVT GPR:$rs1), (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td index a250ac8..5a5a9ed 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td @@ -206,8 +206,6 @@ let Predicates = [HasStdExtZvksh], RVVConstraint = VS2Constraint in { //===----------------------------------------------------------------------===// defvar I32IntegerVectors = !filter(vti, AllIntegerVectors, !eq(vti.SEW, 32)); -defvar I32I64IntegerVectors = !filter(vti, AllIntegerVectors, - !or(!eq(vti.SEW, 32), !eq(vti.SEW, 64))); class ZvkI32IntegerVectors<string vd_lmul> { list<VTypeInfo> vs2_types = !cond(!eq(vd_lmul, "M8") : !filter(vti, I32IntegerVectors, !le(vti.LMul.octuple, 32)), @@ -1126,16 +1124,16 @@ let Predicates = [HasStdExtZvkned] in { defm : VPatUnaryV_S_NoMaskVectorCrypto<"int_riscv_vaesz", "PseudoVAESZ", I32IntegerVectors>; } // Predicates = [HasStdExtZvkned] -let Predicates = [HasStdExtZvknha] in { +let Predicates = [HasStdExtZvknhaOrZvknhb] in { defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2ch", "PseudoVSHA2CH", I32IntegerVectors>; - defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2cl", "PseudoVSHA2CH", I32IntegerVectors>; + defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2cl", "PseudoVSHA2CL", I32IntegerVectors>; defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2ms", "PseudoVSHA2MS", I32IntegerVectors, isSEWAware=true>; } // Predicates = [HasStdExtZvknha] let Predicates = [HasStdExtZvknhb] in { - defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2ch", "PseudoVSHA2CH", I32I64IntegerVectors>; - defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2cl", "PseudoVSHA2CH", I32I64IntegerVectors>; - defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2ms", "PseudoVSHA2MS", I32I64IntegerVectors, isSEWAware=true>; + defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2ch", "PseudoVSHA2CH", I64IntegerVectors>; + defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2cl", "PseudoVSHA2CL", I64IntegerVectors>; + defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2ms", "PseudoVSHA2MS", I64IntegerVectors, isSEWAware=true>; } // Predicates = [HasStdExtZvknhb] let Predicates = [HasStdExtZvksed] in { diff --git a/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.cpp b/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.cpp index 6ecddad..041dd07 100644 --- a/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// #include "RISCVSelectionDAGInfo.h" +#include "RISCVSubtarget.h" +#include "llvm/CodeGen/SelectionDAG.h" #define GET_SDNODE_DESC #include "RISCVGenSDNodeInfo.inc" @@ -62,3 +64,94 @@ void RISCVSelectionDAGInfo::verifyTargetNode(const SelectionDAG &DAG, } #endif } + +SDValue RISCVSelectionDAGInfo::EmitTargetCodeForMemset( + SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, + SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline, + MachinePointerInfo DstPtrInfo) const { + const auto &Subtarget = DAG.getSubtarget<RISCVSubtarget>(); + // We currently do this only for Xqcilsm + if (!Subtarget.hasVendorXqcilsm()) + return SDValue(); + + // Do this only if we know the size 
at compile time. + ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); + if (!ConstantSize) + return SDValue(); + + uint64_t NumberOfBytesToWrite = ConstantSize->getZExtValue(); + + // Do this only if it is word aligned and we write a multiple of 4 bytes. + if (!(Alignment >= 4) || !((NumberOfBytesToWrite & 3) == 0)) + return SDValue(); + + SmallVector<SDValue, 8> OutChains; + SDValue SrcValueReplicated = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src); + int NumberOfWords = NumberOfBytesToWrite / 4; + MachineFunction &MF = DAG.getMachineFunction(); + auto Volatile = + isVolatile ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone; + + // Helper for constructing the QC_SETWMI instruction + auto getSetwmiNode = [&](uint8_t SizeWords, uint8_t OffsetSetwmi) -> SDValue { + SDValue Ops[] = {Chain, SrcValueReplicated, Dst, + DAG.getTargetConstant(SizeWords, dl, MVT::i32), + DAG.getTargetConstant(OffsetSetwmi, dl, MVT::i32)}; + MachineMemOperand *BaseMemOperand = MF.getMachineMemOperand( + DstPtrInfo.getWithOffset(OffsetSetwmi), + MachineMemOperand::MOStore | Volatile, SizeWords * 4, Align(4)); + return DAG.getMemIntrinsicNode(RISCVISD::QC_SETWMI, dl, + DAG.getVTList(MVT::Other), Ops, MVT::i32, + BaseMemOperand); + }; + + // If i8 type and constant non-zero value. + if ((Src.getValueType() == MVT::i8) && !isNullConstant(Src)) + // Replicate byte to word by multiplication with 0x01010101. + SrcValueReplicated = + DAG.getNode(ISD::MUL, dl, MVT::i32, SrcValueReplicated, + DAG.getConstant(0x01010101ul, dl, MVT::i32)); + + // We limit a QC_SETWMI to 16 words or less to improve interruptibility. + // So for 1-16 words we use a single QC_SETWMI: + // + // QC_SETWMI reg1, N, 0(reg2) + // + // For 17-32 words we use two QC_SETWMI's with the first as 16 words and the + // second for the remainder: + // + // QC_SETWMI reg1, 16, 0(reg2) + // QC_SETWMI reg1, N, 64(reg2) + // + // For 33-48 words, we would like to use (16, 16, n), but that means the last + // QC_SETWMI needs an offset of 128 which the instruction doesn't support. + // So in this case we use a length of 15 for the second instruction and we do + // the rest with the third instruction. 
+ // This means the maximum inlined number of words is 47 (for now): + // + // QC_SETWMI R2, R0, 16, 0 + // QC_SETWMI R2, R0, 15, 64 + // QC_SETWMI R2, R0, N, 124 + // + // For 48 words or more, call the target independent memset + if (NumberOfWords >= 48) + return SDValue(); + + if (NumberOfWords <= 16) { + // 1 - 16 words + return getSetwmiNode(NumberOfWords, 0); + } + + if (NumberOfWords <= 32) { + // 17 - 32 words + OutChains.push_back(getSetwmiNode(NumberOfWords - 16, 64)); + OutChains.push_back(getSetwmiNode(16, 0)); + } else { + // 33 - 47 words + OutChains.push_back(getSetwmiNode(NumberOfWords - 31, 124)); + OutChains.push_back(getSetwmiNode(15, 64)); + OutChains.push_back(getSetwmiNode(16, 0)); + } + + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); +} diff --git a/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.h b/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.h index 641189f..08c8d11 100644 --- a/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.h +++ b/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.h @@ -34,6 +34,12 @@ public: void verifyTargetNode(const SelectionDAG &DAG, const SDNode *N) const override; + SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl, + SDValue Chain, SDValue Dst, SDValue Src, + SDValue Size, Align Alignment, + bool isVolatile, bool AlwaysInline, + MachinePointerInfo DstPtrInfo) const override; + bool hasPassthruOp(unsigned Opcode) const { return GenNodeInfo.getDesc(Opcode).TSFlags & RISCVISD::HasPassthruOpMask; } diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index da6ac2f..3f2a83f 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -642,12 +642,6 @@ void RISCVTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { OptimizationLevel Level) { LPM.addPass(LoopIdiomVectorizePass(LoopIdiomVectorizeStyle::Predicated)); }); - - PB.registerVectorizerEndEPCallback( - [](FunctionPassManager &FPM, OptimizationLevel Level) { - if (Level.isOptimizingForSpeed()) - FPM.addPass(createFunctionToLoopPassAdaptor(EVLIndVarSimplifyPass())); - }); } yaml::MachineFunctionInfo * diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 0d5eb86..67f924a 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -979,11 +979,11 @@ InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost( Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) const { - // The interleaved memory access pass will lower interleaved memory ops (i.e - // a load and store followed by a specific shuffle) to vlseg/vsseg - // intrinsics. - if (!UseMaskForCond && !UseMaskForGaps && - Factor <= TLI->getMaxSupportedInterleaveFactor()) { + // The interleaved memory access pass will lower (de)interleave ops combined + // with an adjacent appropriate memory to vlseg/vsseg intrinsics. vlseg/vsseg + // only support masking per-iteration (i.e. condition), not per-segment (i.e. + // gap). 
+ if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) { auto *VTy = cast<VectorType>(VecTy); std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy); // Need to make sure type has't been scalarized diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index d62d99c..05d504c 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -398,6 +398,10 @@ public: bool enableInterleavedAccessVectorization() const override { return true; } + bool enableMaskedInterleavedAccessVectorization() const override { + return ST->hasVInstructions(); + } + unsigned getMinTripCountTailFoldingThreshold() const override; enum RISCVRegisterClass { GPRRC, FPRRC, VRRC }; diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp index c946451..37a71e8 100644 --- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp @@ -69,6 +69,7 @@ struct OperandInfo { // Represent as 1,2,4,8, ... and fractional indicator. This is because // EMUL can take on values that don't map to RISCVVType::VLMUL values exactly. // For example, a mask operand can have an EMUL less than MF8. + // If nullopt, then EMUL isn't used (i.e. only a single scalar is read). std::optional<std::pair<unsigned, bool>> EMUL; unsigned Log2EEW; @@ -83,12 +84,14 @@ struct OperandInfo { OperandInfo() = delete; - static bool EMULAndEEWAreEqual(const OperandInfo &A, const OperandInfo &B) { - return A.Log2EEW == B.Log2EEW && A.EMUL == B.EMUL; - } - - static bool EEWAreEqual(const OperandInfo &A, const OperandInfo &B) { - return A.Log2EEW == B.Log2EEW; + /// Return true if the EMUL and EEW produced by \p Def is compatible with the + /// EMUL and EEW used by \p User. + static bool areCompatible(const OperandInfo &Def, const OperandInfo &User) { + if (Def.Log2EEW != User.Log2EEW) + return false; + if (User.EMUL && Def.EMUL != User.EMUL) + return false; + return true; } void print(raw_ostream &OS) const { @@ -98,7 +101,7 @@ struct OperandInfo { OS << "f"; OS << EMUL->first; } else - OS << "EMUL: unknown\n"; + OS << "EMUL: none\n"; OS << ", EEW: " << (1 << Log2EEW); } }; @@ -1399,13 +1402,7 @@ RISCVVLOptimizer::checkUsers(const MachineInstr &MI) const { return std::nullopt; } - // If the operand is used as a scalar operand, then the EEW must be - // compatible. Otherwise, the EMUL *and* EEW must be compatible. 
- bool IsVectorOpUsedAsScalarOp = isVectorOpUsedAsScalarOp(UserOp); - if ((IsVectorOpUsedAsScalarOp && - !OperandInfo::EEWAreEqual(*ConsumerInfo, *ProducerInfo)) || - (!IsVectorOpUsedAsScalarOp && - !OperandInfo::EMULAndEEWAreEqual(*ConsumerInfo, *ProducerInfo))) { + if (!OperandInfo::areCompatible(*ProducerInfo, *ConsumerInfo)) { LLVM_DEBUG( dbgs() << " Abort due to incompatible information for EMUL or EEW.\n"); diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVAsmBackend.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVAsmBackend.cpp index ef84d43..5710cf2 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVAsmBackend.cpp +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVAsmBackend.cpp @@ -21,8 +21,7 @@ public: SPIRVAsmBackend(llvm::endianness Endian) : MCAsmBackend(Endian) {} void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override {} + uint8_t *Data, uint64_t Value, bool IsResolved) override {} std::unique_ptr<MCObjectTargetWriter> createObjectTargetWriter() const override { diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp index a7f6fbc..64d301e 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp @@ -375,7 +375,7 @@ void SPIRVInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, if (Op.isReg()) O << '%' << (getIDFromRegister(Op.getReg().id()) + 1); else if (Op.isImm()) - O << formatImm((int64_t)Op.getImm()); + O << formatImm(Op.getImm()); else if (Op.isDFPImm()) O << formatImm((double)Op.getDFPImm()); else if (Op.isExpr()) diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index 947b574..2c3e087 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -21,7 +21,9 @@ #include "llvm/IR/InstIterator.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/IntrinsicsSPIRV.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/IR/TypedPointerType.h" +#include "llvm/Transforms/Utils/Local.h" #include <queue> #include <unordered_set> @@ -187,6 +189,8 @@ class SPIRVEmitIntrinsics void applyDemangledPtrArgTypes(IRBuilder<> &B); + GetElementPtrInst *simplifyZeroLengthArrayGepInst(GetElementPtrInst *GEP); + bool runOnFunction(Function &F); bool postprocessTypes(Module &M); bool processFunctionPointers(Module &M); @@ -1458,6 +1462,24 @@ static void createSaturatedConversionDecoration(Instruction *I, createDecorationIntrinsic(I, SaturatedConversionNode, B); } +static void addSaturatedDecorationToIntrinsic(Instruction *I, IRBuilder<> &B) { + if (auto *CI = dyn_cast<CallInst>(I)) { + if (Function *Fu = CI->getCalledFunction()) { + if (Fu->isIntrinsic()) { + unsigned const int IntrinsicId = Fu->getIntrinsicID(); + switch (IntrinsicId) { + case Intrinsic::fptosi_sat: + case Intrinsic::fptoui_sat: + createSaturatedConversionDecoration(I, B); + break; + default: + break; + } + } + } + } +} + Instruction *SPIRVEmitIntrinsics::visitCallInst(CallInst &Call) { if (!Call.isInlineAsm()) return &Call; @@ -2543,6 +2565,30 @@ void SPIRVEmitIntrinsics::applyDemangledPtrArgTypes(IRBuilder<> &B) { } } +GetElementPtrInst * +SPIRVEmitIntrinsics::simplifyZeroLengthArrayGepInst(GetElementPtrInst *GEP) { + // getelementptr [0 x T], P, 0 (zero), I -> getelementptr T, P, I. 
+ // If type is 0-length array and first index is 0 (zero), drop both the + // 0-length array type and the first index. This is a common pattern in the + // IR, e.g. when using a zero-length array as a placeholder for a flexible + // array such as unbound arrays. + assert(GEP && "GEP is null"); + Type *SrcTy = GEP->getSourceElementType(); + SmallVector<Value *, 8> Indices(GEP->indices()); + ArrayType *ArrTy = dyn_cast<ArrayType>(SrcTy); + if (ArrTy && ArrTy->getNumElements() == 0 && + PatternMatch::match(Indices[0], PatternMatch::m_Zero())) { + IRBuilder<> Builder(GEP); + Indices.erase(Indices.begin()); + SrcTy = ArrTy->getElementType(); + Value *NewGEP = Builder.CreateGEP(SrcTy, GEP->getPointerOperand(), Indices, + "", GEP->getNoWrapFlags()); + assert(llvm::isa<GetElementPtrInst>(NewGEP) && "NewGEP should be a GEP"); + return cast<GetElementPtrInst>(NewGEP); + } + return nullptr; +} + bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) { if (Func.isDeclaration()) return false; @@ -2560,14 +2606,30 @@ bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) { AggrConstTypes.clear(); AggrStores.clear(); - // fix GEP result types ahead of inference + // Fix GEP result types ahead of inference, and simplify if possible. + // Data structure for dead instructions that were simplified and replaced. + SmallPtrSet<Instruction *, 4> DeadInsts; for (auto &I : instructions(Func)) { auto *Ref = dyn_cast<GetElementPtrInst>(&I); if (!Ref || GR->findDeducedElementType(Ref)) continue; + + GetElementPtrInst *NewGEP = simplifyZeroLengthArrayGepInst(Ref); + if (NewGEP) { + Ref->replaceAllUsesWith(NewGEP); + if (isInstructionTriviallyDead(Ref)) + DeadInsts.insert(Ref); + + Ref = NewGEP; + } if (Type *GepTy = getGEPType(Ref)) GR->addDeducedElementType(Ref, normalizeType(GepTy)); } + // Remove dead instructions that were simplified and replaced. 
+ for (auto *I : DeadInsts) { + assert(I->use_empty() && "Dead instruction should not have any uses left"); + I->eraseFromParent(); + } processParamTypesByFunHeader(CurrF, B); @@ -2640,6 +2702,7 @@ bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) { if (isConvergenceIntrinsic(I)) continue; + addSaturatedDecorationToIntrinsic(I, B); processInstrAfterVisit(I, B); } diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp index 83fccdc..f1436d5 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp @@ -828,6 +828,8 @@ SPIRVType *SPIRVGlobalRegistry::getOpTypeArray(uint32_t NumElems, "Invalid array element type"); SPIRVType *SpvTypeInt32 = getOrCreateSPIRVIntegerType(32, MIRBuilder); SPIRVType *ArrayType = nullptr; + const SPIRVSubtarget &ST = + cast<SPIRVSubtarget>(MIRBuilder.getMF().getSubtarget()); if (NumElems != 0) { Register NumElementsVReg = buildConstantInt(NumElems, MIRBuilder, SpvTypeInt32, EmitIR); @@ -838,6 +840,10 @@ SPIRVType *SPIRVGlobalRegistry::getOpTypeArray(uint32_t NumElems, .addUse(NumElementsVReg); }); } else { + assert(ST.isShader() && "Runtime arrays are not allowed in non-shader " + "SPIR-V modules."); + if (!ST.isShader()) + return nullptr; ArrayType = createOpType(MIRBuilder, [&](MachineIRBuilder &MIRBuilder) { return MIRBuilder.buildInstr(SPIRV::OpTypeRuntimeArray) .addDef(createTypeVReg(MIRBuilder)) diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index d4fa62a..e9f5ffa 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -665,6 +665,11 @@ bool SPIRVInstructionSelector::spvSelect(Register ResVReg, case TargetOpcode::G_FPTOUI: return selectUnOp(ResVReg, ResType, I, SPIRV::OpConvertFToU); + case TargetOpcode::G_FPTOSI_SAT: + return selectUnOp(ResVReg, ResType, I, SPIRV::OpConvertFToS); + case TargetOpcode::G_FPTOUI_SAT: + return selectUnOp(ResVReg, ResType, I, SPIRV::OpConvertFToU); + case TargetOpcode::G_SITOFP: return selectIToF(ResVReg, ResType, I, true, SPIRV::OpConvertSToF); case TargetOpcode::G_UITOFP: diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp index 1995e0f..170bddd 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp @@ -203,6 +203,10 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) { .legalForCartesianProduct(allIntScalarsAndVectors, allFloatScalarsAndVectors); + getActionDefinitionsBuilder({G_FPTOSI_SAT, G_FPTOUI_SAT}) + .legalForCartesianProduct(allIntScalarsAndVectors, + allFloatScalarsAndVectors); + getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) .legalForCartesianProduct(allFloatScalarsAndVectors, allScalarsAndVectors); diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index 0cd9d78..ab06fc0 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -744,8 +744,14 @@ void SPIRV::RequirementHandler::checkSatisfiable( IsSatisfiable = false; } + AvoidCapabilitiesSet AvoidCaps; + if (!ST.isShader()) + AvoidCaps.S.insert(SPIRV::Capability::Shader); + else + AvoidCaps.S.insert(SPIRV::Capability::Kernel); + for (auto Cap : MinimalCaps) { - if (AvailableCaps.contains(Cap)) + if (AvailableCaps.contains(Cap) && !AvoidCaps.S.contains(Cap)) continue; LLVM_DEBUG(dbgs() << 
"Capability not supported: " << getSymbolicOperandMnemonic( @@ -1865,6 +1871,11 @@ void addInstrRequirements(const MachineInstr &MI, Reqs.addCapability(SPIRV::Capability::TernaryBitwiseFunctionINTEL); break; } + case SPIRV::OpCopyMemorySized: { + Reqs.addCapability(SPIRV::Capability::Addresses); + // TODO: Add UntypedPointersKHR when implemented. + break; + } default: break; diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp index ba023af..bc60842 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp @@ -127,8 +127,7 @@ public: std::optional<MCFixupKind> getFixupKind(StringRef Name) const override; MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override; void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; bool writeNopData(raw_ostream &OS, uint64_t Count, const MCSubtargetInfo *STI) const override { @@ -253,21 +252,19 @@ MCFixupKindInfo SparcAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { } void SparcAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { maybeAddReloc(F, Fixup, Target, Value, IsResolved); if (!IsResolved) return; Value = adjustFixupValue(Fixup.getKind(), Value); unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); - unsigned Offset = Fixup.getOffset(); // For each byte of the fragment that the fixup touches, mask in the // bits from the fixup value. for (unsigned i = 0; i != NumBytes; ++i) { unsigned Idx = Endian == llvm::endianness::little ? 
i : (NumBytes - 1) - i; - Data[Offset + Idx] |= uint8_t((Value >> (i * 8)) & 0xff); + Data[Idx] |= uint8_t((Value >> (i * 8)) & 0xff); } } diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp index a95c4ff..d2071c3 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp @@ -58,8 +58,8 @@ unsigned SparcELFObjectWriter::getRelocType(const MCFixup &Fixup, case ELF::R_SPARC_TLS_IE_ADD: case ELF::R_SPARC_TLS_LE_HIX22: case ELF::R_SPARC_TLS_LE_LOX10: - if (auto *SA = Target.getAddSym()) - cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS); + if (auto *SA = const_cast<MCSymbol *>(Target.getAddSym())) + static_cast<MCSymbolELF *>(SA)->setType(ELF::STT_TLS); break; default: break; diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp index 8b5587a..1bca5c7 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp @@ -111,8 +111,8 @@ unsigned SystemZELFObjectWriter::getRelocType(const MCFixup &Fixup, case SystemZ::S_TLSLD: case SystemZ::S_TLSLDM: case SystemZ::S_DTPOFF: - if (auto *SA = Target.getAddSym()) - cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS); + if (auto *SA = const_cast<MCSymbol *>(Target.getAddSym())) + static_cast<MCSymbolELF *>(SA)->setType(ELF::STT_TLS); break; default: break; diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp index b2cfd04..d692cbe 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp @@ -113,8 +113,7 @@ public: std::optional<MCFixupKind> getFixupKind(StringRef Name) const override; MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override; void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; bool writeNopData(raw_ostream &OS, uint64_t Count, const MCSubtargetInfo *STI) const override; }; @@ -152,20 +151,18 @@ MCFixupKindInfo SystemZMCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { } void SystemZMCAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { if (Target.getSpecifier()) IsResolved = false; maybeAddReloc(F, Fixup, Target, Value, IsResolved); MCFixupKind Kind = Fixup.getKind(); if (mc::isRelocation(Kind)) return; - unsigned Offset = Fixup.getOffset(); unsigned BitSize = getFixupKindInfo(Kind).TargetSize; unsigned Size = (BitSize + 7) / 8; - assert(Offset + Size <= F.getSize() && "Invalid fixup offset!"); + assert(Fixup.getOffset() + Size <= F.getSize() && "Invalid fixup offset!"); // Big-endian insertion of Size bytes. 
Value = extractBitsForFixup(Kind, Value, Fixup, getContext()); @@ -173,7 +170,7 @@ void SystemZMCAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, Value &= ((uint64_t)1 << BitSize) - 1; unsigned ShiftValue = (Size * 8) - 8; for (unsigned I = 0; I != Size; ++I) { - Data[Offset + I] |= uint8_t(Value >> ShiftValue); + Data[I] |= uint8_t(Value >> ShiftValue); ShiftValue -= 8; } } diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp index ae6ca55a36..783f86a 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -1286,7 +1286,7 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl( if ((Opcode == SystemZ::ALFI && OpNum == 0 && isInt<8>((int32_t)MI.getOperand(2).getImm())) || (Opcode == SystemZ::ALGFI && OpNum == 0 && - isInt<8>((int64_t)MI.getOperand(2).getImm()))) { + isInt<8>(MI.getOperand(2).getImm()))) { // AL(G)FI %reg, CONST -> AL(G)SI %mem, CONST Opcode = (Opcode == SystemZ::ALFI ? SystemZ::ALSI : SystemZ::ALGSI); MachineInstr *BuiltMI = @@ -1301,7 +1301,7 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl( if ((Opcode == SystemZ::SLFI && OpNum == 0 && isInt<8>((int32_t)-MI.getOperand(2).getImm())) || (Opcode == SystemZ::SLGFI && OpNum == 0 && - isInt<8>((int64_t)-MI.getOperand(2).getImm()))) { + isInt<8>((-MI.getOperand(2).getImm())))) { // SL(G)FI %reg, CONST -> AL(G)SI %mem, -CONST Opcode = (Opcode == SystemZ::SLFI ? SystemZ::ALSI : SystemZ::ALGSI); MachineInstr *BuiltMI = diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp index b02b6af..c1b9d9f 100644 --- a/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp +++ b/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp @@ -112,8 +112,7 @@ public: } void applyFixup(const MCFragment &, const MCFixup &, const MCValue &, - MutableArrayRef<char>, uint64_t Value, - bool IsResolved) override; + uint8_t *, uint64_t Value, bool IsResolved) override; bool mayNeedRelaxation(unsigned Opcode, ArrayRef<MCOperand> Operands, const MCSubtargetInfo &STI) const override { @@ -152,7 +151,7 @@ public: } // end anonymous namespace void VEAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, MutableArrayRef<char> Data, + const MCValue &Target, uint8_t *Data, uint64_t Value, bool IsResolved) { switch (Fixup.getKind()) { case VE::fixup_ve_tls_gd_hi32: @@ -173,14 +172,14 @@ void VEAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, Value <<= Info.TargetOffset; unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); - unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!"); + assert(Fixup.getOffset() + NumBytes <= F.getSize() && + "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the bits // from the fixup value. The Value has been "split up" into the // appropriate bitfields above. for (unsigned i = 0; i != NumBytes; ++i) { unsigned Idx = Endian == llvm::endianness::little ? 
i : (NumBytes - 1) - i; - Data[Offset + Idx] |= static_cast<uint8_t>((Value >> (i * 8)) & 0xff); + Data[Idx] |= static_cast<uint8_t>((Value >> (i * 8)) & 0xff); } } diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp index 41f31eb..c702064 100644 --- a/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp +++ b/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp @@ -44,8 +44,8 @@ unsigned VEELFObjectWriter::getRelocType(const MCFixup &Fixup, case VE::S_TLS_GD_LO32: case VE::S_TPOFF_HI32: case VE::S_TPOFF_LO32: - if (auto *SA = Target.getAddSym()) - cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS); + if (auto *SA = const_cast<MCSymbol *>(Target.getAddSym())) + static_cast<MCSymbolELF *>(SA)->setType(ELF::STT_TLS); break; default: break; diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp index 6ae69a4..80df4ed 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp @@ -212,12 +212,12 @@ static wasm::WasmLimits defaultLimits() { static MCSymbolWasm *getOrCreateFunctionTableSymbol(MCContext &Ctx, const StringRef &Name, bool Is64) { - MCSymbolWasm *Sym = cast_or_null<MCSymbolWasm>(Ctx.lookupSymbol(Name)); + auto *Sym = static_cast<MCSymbolWasm *>(Ctx.lookupSymbol(Name)); if (Sym) { if (!Sym->isFunctionTable()) Ctx.reportError(SMLoc(), "symbol is not a wasm funcref table"); } else { - Sym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(Name)); + Sym = static_cast<MCSymbolWasm *>(Ctx.getOrCreateSymbol(Name)); Sym->setFunctionTable(Is64); // The default function table is synthesized by the linker. Sym->setUndefined(); @@ -703,7 +703,7 @@ public: ExpectBlockType = false; // The "true" here will cause this to be a nameless symbol. MCSymbol *Sym = Ctx.createTempSymbol("typeindex", true); - auto *WasmSym = cast<MCSymbolWasm>(Sym); + auto *WasmSym = static_cast<MCSymbolWasm *>(Sym); WasmSym->setSignature(Signature); WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION); const MCExpr *Expr = @@ -949,7 +949,8 @@ public: return error("Unknown type in .globaltype modifier: ", TypeTok); } // Now set this symbol with the correct type. - auto *WasmSym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(SymName)); + auto *WasmSym = + static_cast<MCSymbolWasm *>(Ctx.getOrCreateSymbol(SymName)); WasmSym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL); WasmSym->setGlobalType(wasm::WasmGlobalType{uint8_t(*Type), Mutable}); // And emit the directive again. @@ -980,7 +981,8 @@ public: // Now that we have the name and table type, we can actually create the // symbol - auto *WasmSym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(SymName)); + auto *WasmSym = + static_cast<MCSymbolWasm *>(Ctx.getOrCreateSymbol(SymName)); WasmSym->setType(wasm::WASM_SYMBOL_TYPE_TABLE); if (Is64) { Limits.Flags |= wasm::WASM_LIMITS_FLAG_IS_64; @@ -1000,7 +1002,8 @@ public: auto SymName = expectIdent(); if (SymName.empty()) return ParseStatus::Failure; - auto *WasmSym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(SymName)); + auto *WasmSym = + static_cast<MCSymbolWasm *>(Ctx.getOrCreateSymbol(SymName)); if (WasmSym->isDefined()) { // We push 'Function' either when a label is parsed or a .functype // directive is parsed. 
The reason it is not easy to do this uniformly @@ -1042,7 +1045,8 @@ public: auto ExportName = expectIdent(); if (ExportName.empty()) return ParseStatus::Failure; - auto *WasmSym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(SymName)); + auto *WasmSym = + static_cast<MCSymbolWasm *>(Ctx.getOrCreateSymbol(SymName)); WasmSym->setExportName(Ctx.allocateString(ExportName)); TOut.emitExportName(WasmSym, ExportName); return expect(AsmToken::EndOfStatement, "EOL"); @@ -1057,7 +1061,8 @@ public: auto ImportModule = expectIdent(); if (ImportModule.empty()) return ParseStatus::Failure; - auto *WasmSym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(SymName)); + auto *WasmSym = + static_cast<MCSymbolWasm *>(Ctx.getOrCreateSymbol(SymName)); WasmSym->setImportModule(Ctx.allocateString(ImportModule)); TOut.emitImportModule(WasmSym, ImportModule); return expect(AsmToken::EndOfStatement, "EOL"); @@ -1072,7 +1077,8 @@ public: auto ImportName = expectIdent(); if (ImportName.empty()) return ParseStatus::Failure; - auto *WasmSym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(SymName)); + auto *WasmSym = + static_cast<MCSymbolWasm *>(Ctx.getOrCreateSymbol(SymName)); WasmSym->setImportName(Ctx.allocateString(ImportName)); TOut.emitImportName(WasmSym, ImportName); return expect(AsmToken::EndOfStatement, "EOL"); @@ -1082,7 +1088,8 @@ public: auto SymName = expectIdent(); if (SymName.empty()) return ParseStatus::Failure; - auto *WasmSym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(SymName)); + auto *WasmSym = + static_cast<MCSymbolWasm *>(Ctx.getOrCreateSymbol(SymName)); auto *Signature = Ctx.createWasmSignature(); if (parseRegTypeList(Signature->Params)) return ParseStatus::Failure; @@ -1224,7 +1231,7 @@ public: if (!CWS->isText()) return; - auto *WasmSym = cast<MCSymbolWasm>(Symbol); + auto *WasmSym = static_cast<MCSymbolWasm *>(Symbol); // Unlike other targets, we don't allow data in text sections (labels // declared with .type @object). 
if (WasmSym->getType() == wasm::WASM_SYMBOL_TYPE_DATA) { diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp index 4a305ab..6943888 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp @@ -258,7 +258,7 @@ bool WebAssemblyAsmTypeCheck::getGlobal(SMLoc ErrorLoc, const MCSymbolRefExpr *SymRef; if (getSymRef(ErrorLoc, GlobalOp, SymRef)) return true; - const auto *WasmSym = cast<MCSymbolWasm>(&SymRef->getSymbol()); + auto *WasmSym = static_cast<const MCSymbolWasm *>(&SymRef->getSymbol()); switch (WasmSym->getType().value_or(wasm::WASM_SYMBOL_TYPE_DATA)) { case wasm::WASM_SYMBOL_TYPE_GLOBAL: Type = static_cast<wasm::ValType>(WasmSym->getGlobalType().Type); @@ -286,7 +286,7 @@ bool WebAssemblyAsmTypeCheck::getTable(SMLoc ErrorLoc, const MCOperand &TableOp, const MCSymbolRefExpr *SymRef; if (getSymRef(ErrorLoc, TableOp, SymRef)) return true; - const auto *WasmSym = cast<MCSymbolWasm>(&SymRef->getSymbol()); + auto *WasmSym = static_cast<const MCSymbolWasm *>(&SymRef->getSymbol()); if (WasmSym->getType().value_or(wasm::WASM_SYMBOL_TYPE_DATA) != wasm::WASM_SYMBOL_TYPE_TABLE) return typeError(ErrorLoc, StringRef("symbol ") + WasmSym->getName() + @@ -302,7 +302,7 @@ bool WebAssemblyAsmTypeCheck::getSignature(SMLoc ErrorLoc, const MCSymbolRefExpr *SymRef = nullptr; if (getSymRef(ErrorLoc, SigOp, SymRef)) return true; - const auto *WasmSym = cast<MCSymbolWasm>(&SymRef->getSymbol()); + auto *WasmSym = static_cast<const MCSymbolWasm *>(&SymRef->getSymbol()); Sig = WasmSym->getSignature(); if (!Sig || WasmSym->getType() != Type) { diff --git a/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp b/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp index 0f7b27b..2a398d4 100644 --- a/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp +++ b/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp @@ -237,7 +237,7 @@ MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction( } else { // We don't have access to the signature, so create a symbol without one MCSymbol *Sym = getContext().createTempSymbol("typeindex", true); - auto *WasmSym = cast<MCSymbolWasm>(Sym); + auto *WasmSym = static_cast<MCSymbolWasm *>(Sym); WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION); const MCExpr *Expr = MCSymbolRefExpr::create( WasmSym, WebAssembly::S_TYPEINDEX, getContext()); diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp index 84eb15f..eecef31 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp @@ -39,7 +39,7 @@ public: MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override; void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, bool) override; + uint8_t *Data, uint64_t Value, bool) override; std::unique_ptr<MCObjectTargetWriter> createObjectTargetWriter() const override; @@ -80,8 +80,7 @@ bool WebAssemblyAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count, void WebAssemblyAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, + const MCValue &Target, uint8_t *Data, uint64_t Value, bool IsResolved) { if (!IsResolved) 
Asm->getWriter().recordRelocation(F, Fixup, Target, Value); @@ -96,13 +95,13 @@ void WebAssemblyAsmBackend::applyFixup(const MCFragment &F, // Shift the value into position. Value <<= Info.TargetOffset; - unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= F.getSize() && "Invalid fixup offset!"); + assert(Fixup.getOffset() + NumBytes <= F.getSize() && + "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the // bits from the fixup value. for (unsigned I = 0; I != NumBytes; ++I) - Data[Offset + I] |= uint8_t((Value >> (I * 8)) & 0xff); + Data[I] |= uint8_t((Value >> (I * 8)) & 0xff); } std::unique_ptr<MCObjectTargetWriter> diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp index 2e97215..d8bfed9 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp @@ -380,7 +380,7 @@ void WebAssemblyInstPrinter::printWebAssemblySignatureOperand(const MCInst *MI, O << WebAssembly::anyTypeToString(Imm); } else { auto Expr = cast<MCSymbolRefExpr>(Op.getExpr()); - auto *Sym = cast<MCSymbolWasm>(&Expr->getSymbol()); + auto *Sym = static_cast<const MCSymbolWasm *>(&Expr->getSymbol()); if (Sym->getSignature()) { O << WebAssembly::signatureToString(Sym->getSignature()); } else { @@ -398,10 +398,10 @@ void WebAssemblyInstPrinter::printCatchList(const MCInst *MI, unsigned OpNo, auto PrintTagOp = [&](const MCOperand &Op) { const MCSymbolRefExpr *TagExpr = nullptr; - const MCSymbolWasm *TagSym = nullptr; + const MCSymbol *TagSym = nullptr; if (Op.isExpr()) { TagExpr = cast<MCSymbolRefExpr>(Op.getExpr()); - TagSym = cast<MCSymbolWasm>(&TagExpr->getSymbol()); + TagSym = &TagExpr->getSymbol(); O << TagSym->getName() << " "; } else { // When instructions are parsed from the disassembler, we have an diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp index cbaf10f..7096104 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp @@ -107,7 +107,7 @@ void WebAssemblyMCCodeEmitter::encodeInstruction( encodeULEB128(uint32_t(MO.getImm()), OS); break; case WebAssembly::OPERAND_I64IMM: - encodeSLEB128(int64_t(MO.getImm()), OS); + encodeSLEB128(MO.getImm(), OS); break; case WebAssembly::OPERAND_SIGNATURE: case WebAssembly::OPERAND_VEC_I8IMM: diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp index 2cf4bec..ffbc7e1 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp @@ -66,7 +66,7 @@ static const MCSection *getTargetSection(const MCExpr *Expr) { unsigned WebAssemblyWasmObjectWriter::getRelocType( const MCValue &Target, const MCFixup &Fixup, const MCSectionWasm &FixupSection, bool IsLocRel) const { - auto &SymA = cast<MCSymbolWasm>(*Target.getAddSym()); + auto &SymA = static_cast<const MCSymbolWasm &>(*Target.getAddSym()); auto Spec = WebAssembly::Specifier(Target.getSpecifier()); switch (Spec) { case WebAssembly::S_GOT: diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp index 
1bf070e..db832bc 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp @@ -171,10 +171,10 @@ MCSymbolWasm *WebAssemblyAsmPrinter::getMCSymbolForFunction( WebAssembly::signatureToString(Sig); report_fatal_error(Twine(Msg)); } - WasmSym = cast<MCSymbolWasm>( + WasmSym = static_cast<MCSymbolWasm *>( GetExternalSymbolSymbol(getEmscriptenInvokeSymbolName(Sig))); } else { - WasmSym = cast<MCSymbolWasm>(getSymbol(F)); + WasmSym = static_cast<MCSymbolWasm *>(getSymbol(F)); } return WasmSym; } @@ -186,9 +186,7 @@ void WebAssemblyAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { } assert(!GV->isThreadLocal()); - - MCSymbolWasm *Sym = cast<MCSymbolWasm>(getSymbol(GV)); - + auto *Sym = static_cast<MCSymbolWasm *>(getSymbol(GV)); if (!Sym->getType()) { SmallVector<MVT, 1> VTs; Type *GlobalVT = GV->getValueType(); @@ -218,8 +216,7 @@ void WebAssemblyAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { } MCSymbol *WebAssemblyAsmPrinter::getOrCreateWasmSymbol(StringRef Name) { - auto *WasmSym = cast<MCSymbolWasm>(GetExternalSymbolSymbol(Name)); - + auto *WasmSym = static_cast<MCSymbolWasm *>(GetExternalSymbolSymbol(Name)); // May be called multiple times, so early out. if (WasmSym->getType()) return WasmSym; @@ -312,7 +309,7 @@ void WebAssemblyAsmPrinter::emitDecls(const Module &M) { // not be found here. MachineModuleInfoWasm &MMIW = MMI->getObjFileInfo<MachineModuleInfoWasm>(); for (StringRef Name : MMIW.MachineSymbolsUsed) { - auto *WasmSym = cast<MCSymbolWasm>(getOrCreateWasmSymbol(Name)); + auto *WasmSym = static_cast<MCSymbolWasm *>(getOrCreateWasmSymbol(Name)); if (WasmSym->isFunction()) { // TODO(wvo): is there any case where this overlaps with the call to // emitFunctionType in the loop below? @@ -324,7 +321,7 @@ void WebAssemblyAsmPrinter::emitDecls(const Module &M) { // Emit .globaltype, .tagtype, or .tabletype declarations for extern // declarations, i.e. 
those that have only been declared (but not defined) // in the current module - auto Sym = cast_or_null<MCSymbolWasm>(It.getValue().Symbol); + auto Sym = static_cast<MCSymbolWasm *>(It.getValue().Symbol); if (Sym && !Sym->isDefined()) emitSymbolType(Sym); } @@ -381,7 +378,7 @@ void WebAssemblyAsmPrinter::emitDecls(const Module &M) { } if (F.hasFnAttribute("wasm-export-name")) { - auto *Sym = cast<MCSymbolWasm>(getSymbol(&F)); + auto *Sym = static_cast<MCSymbolWasm *>(getSymbol(&F)); StringRef Name = F.getFnAttribute("wasm-export-name").getValueAsString(); Sym->setExportName(OutContext.allocateString(Name)); getTargetStreamer()->emitExportName(Sym, Name); @@ -581,7 +578,7 @@ void WebAssemblyAsmPrinter::EmitFunctionAttributes(Module &M) { auto *GV = cast<GlobalVariable>(CS->getOperand(1)->stripPointerCasts()); StringRef AnnotationString; getConstantStringInfo(GV, AnnotationString); - auto *Sym = cast<MCSymbolWasm>(getSymbol(F)); + auto *Sym = static_cast<MCSymbolWasm *>(getSymbol(F)); CustomSections[AnnotationString].push_back(Sym); } @@ -618,7 +615,7 @@ void WebAssemblyAsmPrinter::emitFunctionBodyStart() { computeSignatureVTs(F.getFunctionType(), &F, F, TM, ParamVTs, ResultVTs); auto Signature = signatureFromMVTs(OutContext, ResultVTs, ParamVTs); - auto *WasmSym = cast<MCSymbolWasm>(CurrentFnSym); + auto *WasmSym = static_cast<MCSymbolWasm *>(CurrentFnSym); WasmSym->setSignature(Signature); WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp index 4613fcb..e48283a 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp @@ -52,7 +52,7 @@ MCSymbol * WebAssemblyMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const { const GlobalValue *Global = MO.getGlobal(); if (!isa<Function>(Global)) { - auto *WasmSym = cast<MCSymbolWasm>(Printer.getSymbol(Global)); + auto *WasmSym = static_cast<MCSymbolWasm *>(Printer.getSymbol(Global)); // If the symbol doesn't have an explicit WasmSymbolType yet and the // GlobalValue is actually a WebAssembly global, then ensure the symbol is a // WASM_SYMBOL_TYPE_GLOBAL. 
@@ -123,7 +123,7 @@ MCOperand WebAssemblyMCInstLower::lowerSymbolOperand(const MachineOperand &MO, const MCExpr *Expr = MCSymbolRefExpr::create(Sym, Spec, Ctx); if (MO.getOffset() != 0) { - const auto *WasmSym = cast<MCSymbolWasm>(Sym); + const auto *WasmSym = static_cast<const MCSymbolWasm *>(Sym); if (TargetFlags == WebAssemblyII::MO_GOT) report_fatal_error("GOT symbol references do not support offsets"); if (WasmSym->isFunction()) @@ -148,12 +148,12 @@ MCOperand WebAssemblyMCInstLower::lowerTypeIndexOperand( auto Signature = Ctx.createWasmSignature(); Signature->Returns = std::move(Returns); Signature->Params = std::move(Params); - MCSymbol *Sym = Printer.createTempSymbol("typeindex"); - auto *WasmSym = cast<MCSymbolWasm>(Sym); - WasmSym->setSignature(Signature); - WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION); + auto *Sym = + static_cast<MCSymbolWasm *>(Printer.createTempSymbol("typeindex")); + Sym->setSignature(Signature); + Sym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION); const MCExpr *Expr = - MCSymbolRefExpr::create(WasmSym, WebAssembly::S_TYPEINDEX, Ctx); + MCSymbolRefExpr::create(Sym, WebAssembly::S_TYPEINDEX, Ctx); return MCOperand::createExpr(Expr); } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp index 747ef18..42d1271 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp @@ -104,13 +104,13 @@ const MachineOperand &WebAssembly::getCalleeOp(const MachineInstr &MI) { MCSymbolWasm *WebAssembly::getOrCreateFunctionTableSymbol( MCContext &Ctx, const WebAssemblySubtarget *Subtarget) { StringRef Name = "__indirect_function_table"; - MCSymbolWasm *Sym = cast_or_null<MCSymbolWasm>(Ctx.lookupSymbol(Name)); + auto *Sym = static_cast<MCSymbolWasm *>(Ctx.lookupSymbol(Name)); if (Sym) { if (!Sym->isFunctionTable()) Ctx.reportError(SMLoc(), "symbol is not a wasm funcref table"); } else { bool is64 = Subtarget && Subtarget->getTargetTriple().isArch64Bit(); - Sym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(Name)); + Sym = static_cast<MCSymbolWasm *>(Ctx.getOrCreateSymbol(Name)); Sym->setFunctionTable(is64); // The default function table is synthesized by the linker. Sym->setUndefined(); @@ -124,12 +124,12 @@ MCSymbolWasm *WebAssembly::getOrCreateFunctionTableSymbol( MCSymbolWasm *WebAssembly::getOrCreateFuncrefCallTableSymbol( MCContext &Ctx, const WebAssemblySubtarget *Subtarget) { StringRef Name = "__funcref_call_table"; - MCSymbolWasm *Sym = cast_or_null<MCSymbolWasm>(Ctx.lookupSymbol(Name)); + auto *Sym = static_cast<MCSymbolWasm *>(Ctx.lookupSymbol(Name)); if (Sym) { if (!Sym->isFunctionTable()) Ctx.reportError(SMLoc(), "symbol is not a wasm funcref table"); } else { - Sym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(Name)); + Sym = static_cast<MCSymbolWasm *>(Ctx.getOrCreateSymbol(Name)); // Setting Weak ensure only one table is left after linking when multiple // modules define the table. 
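The WebAssembly hunks above, like the Sparc/SystemZ/VE relocation-writer hunks earlier, replace llvm::cast<MCSymbolWasm> (or cast<MCSymbolELF>, together with a const_cast where the symbol comes from MCValue::getAddSym()) with a plain static_cast. The practical difference is only in checking: llvm::cast verifies the dynamic type through the target class's classof() in asserts builds, while static_cast is an unchecked downcast that relies on an invariant of the surrounding code (presumably that the MCContext for these targets only ever creates the target-specific symbol class). A minimal sketch of that contrast on a stand-in hierarchy, not taken from the patch and with illustrative names only:

#include "llvm/Support/Casting.h"

namespace {
// Stand-ins for MCSymbol and MCSymbolWasm; hypothetical types for illustration.
struct Symbol {
  enum KindTy { SK_Generic, SK_Wasm } Kind;
  explicit Symbol(KindTy K) : Kind(K) {}
};
struct WasmSymbol : Symbol {
  WasmSymbol() : Symbol(SK_Wasm) {}
  static bool classof(const Symbol *S) { return S->Kind == SK_Wasm; }
};
} // namespace

WasmSymbol *downcast(Symbol *S) {
  // Checked form: in +Asserts builds this calls classof() and aborts on a
  // type mismatch.
  WasmSymbol *Checked = llvm::cast<WasmSymbol>(S);
  // Unchecked form, as used after this patch: correct only when the caller
  // already guarantees that S really is a WasmSymbol.
  WasmSymbol *Unchecked = static_cast<WasmSymbol *>(S);
  return Checked == Unchecked ? Checked : nullptr;
}

Both forms compile to the same pointer adjustment in release builds; only the asserts-build diagnostics differ.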
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 1efef83..56a4cc3 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -174,8 +174,7 @@ public: std::optional<bool> evaluateFixup(const MCFragment &, MCFixup &, MCValue &, uint64_t &) override; void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; bool mayNeedRelaxation(unsigned Opcode, ArrayRef<MCOperand> Operands, const MCSubtargetInfo &STI) const override; @@ -512,9 +511,8 @@ void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS, isFirstMacroFusibleInst(Inst, *MCII))) { // If we meet a unfused branch or the first instuction in a fusiable pair, // insert a BoundaryAlign fragment. - PendingBA = OS.getContext().allocFragment<MCBoundaryAlignFragment>( - AlignBoundary, STI); - OS.insert(PendingBA); + PendingBA = + OS.newSpecialFragment<MCBoundaryAlignFragment>(AlignBoundary, STI); } } @@ -676,9 +674,8 @@ std::optional<bool> X86AsmBackend::evaluateFixup(const MCFragment &, } void X86AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { // Force relocation when there is a specifier. This might be too conservative // - GAS doesn't emit a relocation for call local@plt; local:. if (Target.getSpecifier()) @@ -710,7 +707,7 @@ void X86AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, } for (unsigned i = 0; i != Size; ++i) - Data[Fixup.getOffset() + i] = uint8_t(Value >> (i * 8)); + Data[i] = uint8_t(Value >> (i * 8)); } bool X86AsmBackend::mayNeedRelaxation(unsigned Opcode, diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp index 3323b38..ea0abdd 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -349,8 +349,8 @@ unsigned X86ELFObjectWriter::getRelocType(const MCFixup &Fixup, case X86::S_TLSLDM: case X86::S_TPOFF: case X86::S_DTPOFF: - if (auto *S = Target.getAddSym()) - cast<MCSymbolELF>(S)->setType(ELF::STT_TLS); + if (auto *S = const_cast<MCSymbol *>(Target.getAddSym())) + static_cast<MCSymbolELF *>(S)->setType(ELF::STT_TLS); break; default: break; diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp index b8e117b..ff27005 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp @@ -369,7 +369,7 @@ void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, if (Op.isReg()) { printRegName(O, Op.getReg()); } else if (Op.isImm()) { - markup(O, Markup::Immediate) << formatImm((int64_t)Op.getImm()); + markup(O, Markup::Immediate) << formatImm(Op.getImm()); } else { assert(Op.isExpr() && "unknown operand kind in printOperand"); O << "offset "; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index bbbb1d9..f366094 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -8279,8 +8279,8 @@ static SDValue ExpandHorizontalBinOp(const SDValue &V0, const 
SDValue &V1, static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, - unsigned &NumExtracts, - bool &IsSubAdd) { + unsigned &NumExtracts, bool &IsSubAdd, + bool &HasAllowContract) { using namespace SDPatternMatch; MVT VT = BV->getSimpleValueType(0); @@ -8292,6 +8292,7 @@ static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV, SDValue InVec1 = DAG.getUNDEF(VT); NumExtracts = 0; + HasAllowContract = NumElts != 0; // Odd-numbered elements in the input build vector are obtained from // adding/subtracting two integer/float elements. @@ -8350,6 +8351,7 @@ static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV, // Increment the number of extractions done. ++NumExtracts; + HasAllowContract &= Op->getFlags().hasAllowContract(); } // Ensure we have found an opcode for both parities and that they are @@ -8393,9 +8395,10 @@ static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV, /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit /// FMADDSUB is. static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget, - SelectionDAG &DAG, - SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2, - unsigned ExpectedUses) { + SelectionDAG &DAG, SDValue &Opnd0, + SDValue &Opnd1, SDValue &Opnd2, + unsigned ExpectedUses, + bool AllowSubAddOrAddSubContract) { if (Opnd0.getOpcode() != ISD::FMUL || !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA()) return false; @@ -8406,7 +8409,8 @@ static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget, // or MUL + ADDSUB to FMADDSUB. const TargetOptions &Options = DAG.getTarget().Options; bool AllowFusion = - (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath); + Options.AllowFPOpFusion == FPOpFusion::Fast || + (AllowSubAddOrAddSubContract && Opnd0->getFlags().hasAllowContract()); if (!AllowFusion) return false; @@ -8427,15 +8431,17 @@ static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, SDValue Opnd0, Opnd1; unsigned NumExtracts; bool IsSubAdd; - if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts, - IsSubAdd)) + bool HasAllowContract; + if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts, IsSubAdd, + HasAllowContract)) return SDValue(); MVT VT = BV->getSimpleValueType(0); // Try to generate X86ISD::FMADDSUB node here. SDValue Opnd2; - if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) { + if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts, + HasAllowContract)) { unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB; return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2); } @@ -9132,11 +9138,17 @@ LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue SrcVec, IndicesVec; + + auto PeekThroughFreeze = [](SDValue N) { + if (N->getOpcode() == ISD::FREEZE && N.hasOneUse()) + return N->getOperand(0); + return N; + }; // Check for a match of the permute source vector and permute index elements. // This is done by checking that the i-th build_vector operand is of the form: // (extract_elt SrcVec, (extract_elt IndicesVec, i)). 
for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) { - SDValue Op = V.getOperand(Idx); + SDValue Op = PeekThroughFreeze(V.getOperand(Idx)); if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); @@ -23486,7 +23498,6 @@ static SDValue EmitCmp(SDValue Op0, SDValue Op1, X86::CondCode X86CC, } // Try to shrink i64 compares if the input has enough zero bits. - // TODO: Add sign-bits equivalent for isX86CCSigned(X86CC)? if (CmpVT == MVT::i64 && !isX86CCSigned(X86CC) && Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub. DAG.MaskedValueIsZero(Op1, APInt::getHighBitsSet(64, 32)) && @@ -23496,6 +23507,16 @@ static SDValue EmitCmp(SDValue Op0, SDValue Op1, X86::CondCode X86CC, Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1); } + // Try to shrink all i64 compares if the inputs are representable as signed + // i32. + if (CmpVT == MVT::i64 && + Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub. + DAG.ComputeNumSignBits(Op1) > 32 && DAG.ComputeNumSignBits(Op0) > 32) { + CmpVT = MVT::i32; + Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0); + Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1); + } + // 0-x == y --> x+y == 0 // 0-x != y --> x+y != 0 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) && @@ -43165,7 +43186,7 @@ static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) { /// the fact that they're unused. static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, - bool &IsSubAdd) { + bool &IsSubAdd, bool &HasAllowContract) { EVT VT = N->getValueType(0); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -43216,6 +43237,8 @@ static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget, // It's a subadd if the vector in the even parity is an FADD. IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD : V2->getOpcode() == ISD::FADD; + HasAllowContract = + V1->getFlags().hasAllowContract() && V2->getFlags().hasAllowContract(); Opnd0 = LHS; Opnd1 = RHS; @@ -43273,14 +43296,17 @@ static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL, SDValue Opnd0, Opnd1; bool IsSubAdd; - if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd)) + bool HasAllowContract; + if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd, + HasAllowContract)) return SDValue(); MVT VT = N->getSimpleValueType(0); // Try to generate X86ISD::FMADDSUB node here. SDValue Opnd2; - if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) { + if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2, + HasAllowContract)) { unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB; return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2); } @@ -54220,7 +54246,7 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, } // Try to form a MULHU or MULHS node by looking for -// (trunc (srl (mul ext, ext), 16)) +// (trunc (srl (mul ext, ext), >= 16)) // TODO: This is X86 specific because we want to be able to handle wide types // before type legalization. But we can only do it if the vector will be // legalized via widening/splitting. Type legalization can't handle promotion @@ -54245,10 +54271,16 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, // First instruction should be a right shift by 16 of a multiply. 
SDValue LHS, RHS; + APInt ShiftAmt; if (!sd_match(Src, - m_Srl(m_Mul(m_Value(LHS), m_Value(RHS)), m_SpecificInt(16)))) + m_Srl(m_Mul(m_Value(LHS), m_Value(RHS)), m_ConstInt(ShiftAmt)))) return SDValue(); + if (ShiftAmt.ult(16) || ShiftAmt.uge(InVT.getScalarSizeInBits())) + return SDValue(); + + uint64_t AdditionalShift = ShiftAmt.getZExtValue() - 16; + // Count leading sign/zero bits on both inputs - if there are enough then // truncation back to vXi16 will be cheap - either as a pack/shuffle // sequence or using AVX512 truncations. If the inputs are sext/zext then the @@ -54286,7 +54318,9 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, InVT.getSizeInBits() / 16); SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS), DAG.getBitcast(BCVT, RHS)); - return DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res)); + Res = DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res)); + return DAG.getNode(ISD::SRL, DL, VT, Res, + DAG.getShiftAmountConstant(AdditionalShift, VT, DL)); } // Truncate back to source type. @@ -54294,7 +54328,9 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS); unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU; - return DAG.getNode(Opc, DL, VT, LHS, RHS); + SDValue Res = DAG.getNode(Opc, DL, VT, LHS, RHS); + return DAG.getNode(ISD::SRL, DL, VT, Res, + DAG.getShiftAmountConstant(AdditionalShift, VT, DL)); } // Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp index 5862c7e..7c594d0 100644 --- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp +++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp @@ -2781,6 +2781,38 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, return Bytes == MFI.getObjectSize(FI); } +static bool +mayBeSRetTailCallCompatible(const TargetLowering::CallLoweringInfo &CLI, + Register CallerSRetReg) { + const auto &Outs = CLI.Outs; + const auto &OutVals = CLI.OutVals; + + // We know the caller has a sret pointer argument (CallerSRetReg). Locate the + // operand index within the callee that may have a sret pointer too. + unsigned Pos = 0; + for (unsigned E = Outs.size(); Pos != E; ++Pos) + if (Outs[Pos].Flags.isSRet()) + break; + // Bail out if the callee does not have any sret argument. + if (Pos == Outs.size()) + return false; + + // At this point, either the caller is forwarding its sret argument to the + // callee, or the callee is being passed a different sret pointer. We now look + // for a CopyToReg, where the callee sret argument is written into a new vreg + // (which should later be %rax/%eax, if this is returned). + SDValue SRetArgVal = OutVals[Pos]; + for (SDNode *User : SRetArgVal->users()) { + if (User->getOpcode() != ISD::CopyToReg) + continue; + Register Reg = cast<RegisterSDNode>(User->getOperand(1))->getReg(); + if (Reg == CallerSRetReg && User->getOperand(2) == SRetArgVal) + return true; + } + + return false; +} + /// Check whether the call is eligible for tail call optimization. Targets /// that want to do tail call optimization should implement this function. /// Note that the x86 backend does not check musttail calls for eligibility! The @@ -2802,6 +2834,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization( // If -tailcallopt is specified, make fastcc functions tail-callable.
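The combinePMULH rewrite above now accepts any constant right-shift amount in [16, source scalar width) and emits the remaining AdditionalShift as an SRL after the MULHU/MULHS node. The identity it relies on can be checked in isolation; the snippet below is an illustrative standalone test of the unsigned case, not code from the patch:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t x = 0; x < 0x10000; x += 251)
        for (uint32_t y = 0; y < 0x10000; y += 241)
          for (unsigned s = 16; s < 32; ++s) {
            uint32_t prod = x * y;                   // zext(x) * zext(y)
            uint16_t direct = (uint16_t)(prod >> s); // trunc(srl(mul, s))
            uint16_t viaMulh = (uint16_t)((uint16_t)(prod >> 16) >> (s - 16)); // srl(mulhu, s - 16)
            assert(direct == viaMulh);
          }
      return 0;
    }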
MachineFunction &MF = DAG.getMachineFunction(); + X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); const Function &CallerF = MF.getFunction(); // If the function return type is x86_fp80 and the callee return type is not, @@ -2838,14 +2871,15 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization( if (RegInfo->hasStackRealignment(MF)) return false; - // Also avoid sibcall optimization if we're an sret return fn and the callee - // is incompatible. See comment in LowerReturn about why hasStructRetAttr is - // insufficient. - if (MF.getInfo<X86MachineFunctionInfo>()->getSRetReturnReg()) { + // Avoid sibcall optimization if we are an sret return function and the callee + // is incompatible, unless we can prove otherwise. See comment in + // LowerReturn about why hasStructRetAttr is insufficient. + if (Register SRetReg = FuncInfo->getSRetReturnReg()) { // For a compatible tail call the callee must return our sret pointer. So it // needs to be (a) an sret function itself and (b) we pass our sret as its // sret. Condition #b is harder to determine. - return false; + if (!mayBeSRetTailCallCompatible(CLI, SRetReg)) + return false; } else if (IsCalleePopSRet) // The callee pops an sret, so we cannot tail-call, as our caller doesn't // expect that. @@ -2967,8 +3001,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization( X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg, MF.getTarget().Options.GuaranteedTailCallOpt); - if (unsigned BytesToPop = - MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) { + if (unsigned BytesToPop = FuncInfo->getBytesToPopOnReturn()) { // If we have bytes to pop, the callee must pop them. bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize; if (!CalleePopMatches) diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaAsmBackend.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaAsmBackend.cpp index 9167794..08936ad 100644 --- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaAsmBackend.cpp +++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaAsmBackend.cpp @@ -37,8 +37,7 @@ public: std::optional<bool> evaluateFixup(const MCFragment &, MCFixup &, MCValue &, uint64_t &) override; void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; bool writeNopData(raw_ostream &OS, uint64_t Count, const MCSubtargetInfo *STI) const override; @@ -153,9 +152,8 @@ std::optional<bool> XtensaAsmBackend::evaluateFixup(const MCFragment &F, } void XtensaAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { maybeAddReloc(F, Fixup, Target, Value, IsResolved); MCContext &Ctx = getContext(); MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind()); @@ -168,11 +166,10 @@ void XtensaAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, if (!Value) return; // Doesn't change encoding. - unsigned Offset = Fixup.getOffset(); unsigned FullSize = getSize(Fixup.getKind()); for (unsigned i = 0; i != FullSize; ++i) { - Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); + Data[i] |= uint8_t((Value >> (i * 8)) & 0xff); } }
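The Xtensa applyFixup change above tracks the MCAsmBackend interface update: Data is now a uint8_t pointer that already addresses the fixup location, so the loop drops the Fixup.getOffset() indexing and ORs the value in byte by byte, little-endian. A standalone sketch of that patching pattern (the 3-byte encoding and fixup value below are made up for illustration):

    #include <cstdint>
    #include <cstdio>

    static void applyLEFixup(uint8_t *Data, uint64_t Value, unsigned FullSize) {
      // OR the resolved value into the instruction bytes, least significant byte first.
      for (unsigned i = 0; i != FullSize; ++i)
        Data[i] |= uint8_t((Value >> (i * 8)) & 0xff);
    }

    int main() {
      uint8_t Insn[3] = {0x12, 0x00, 0x00}; // hypothetical encoding, immediate field zeroed
      applyLEFixup(Insn, 0x00AB00, 3);      // hypothetical resolved fixup value
      std::printf("%02x %02x %02x\n", Insn[0], Insn[1], Insn[2]); // prints: 12 ab 00
      return 0;
    }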