path: root/llvm/lib/Target
Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--  llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp  38
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp  163
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.h  7
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrInfo.cpp  270
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrInfo.h  4
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrInfo.td  69
-rw-r--r--  llvm/lib/Target/AArch64/AArch64RegisterInfo.td  3
-rw-r--r--  llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp  23
-rw-r--r--  llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp  2
-rw-r--r--  llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp  3
-rw-r--r--  llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp  2
-rw-r--r--  llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp  3
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPU.td  55
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp  2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUGISel.td  6
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp  61
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h  2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp  151
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h  12
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp  21
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h  6
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp  3
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp  317
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp  239
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h  1
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp  25
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h  6
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp  31
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp  3
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h  3
-rw-r--r--  llvm/lib/Target/AMDGPU/FLATInstructions.td  176
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNRegPressure.cpp  6
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNRegPressure.h  40
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp  11
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSubtarget.h  21
-rw-r--r--  llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp  1
-rw-r--r--  llvm/lib/Target/AMDGPU/R600TargetMachine.cpp  1
-rw-r--r--  llvm/lib/Target/AMDGPU/SIDefines.h  12
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFoldOperands.cpp  4
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp  69
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp  8
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp  61
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.h  3
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.td  8
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstructions.td  10
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp  14
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp  36
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.td  43
-rw-r--r--  llvm/lib/Target/AMDGPU/SMInstructions.td  5
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP2Instructions.td  17
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP3PInstructions.td  177
-rw-r--r--  llvm/lib/Target/ARM/ARMISelLowering.cpp  12
-rw-r--r--  llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp  4
-rw-r--r--  llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp  14
-rw-r--r--  llvm/lib/Target/AVR/AVRAsmPrinter.cpp  2
-rw-r--r--  llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp  2
-rw-r--r--  llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h  4
-rw-r--r--  llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp  2
-rw-r--r--  llvm/lib/Target/BPF/BTFDebug.cpp  6
-rw-r--r--  llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp  7
-rw-r--r--  llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp  1
-rw-r--r--  llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h  5
-rw-r--r--  llvm/lib/Target/DirectX/DXILDataScalarization.cpp  46
-rw-r--r--  llvm/lib/Target/DirectX/DXILFlattenArrays.cpp  9
-rw-r--r--  llvm/lib/Target/DirectX/DXILLegalizePass.cpp  97
-rw-r--r--  llvm/lib/Target/DirectX/DXILResourceAccess.cpp  3
-rw-r--r--  llvm/lib/Target/DirectX/DXILRootSignature.cpp  471
-rw-r--r--  llvm/lib/Target/DirectX/DXILRootSignature.h  11
-rw-r--r--  llvm/lib/Target/DirectX/DXILShaderFlags.cpp  15
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp  1
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp  31
-rw-r--r--  llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp  1
-rw-r--r--  llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp  2
-rw-r--r--  llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp  2
-rw-r--r--  llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h  2
-rw-r--r--  llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp  5
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp  7
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td  6
-rw-r--r--  llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp  2
-rw-r--r--  llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h  2
-rw-r--r--  llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp  2
-rw-r--r--  llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp  32
-rw-r--r--  llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp  9
-rw-r--r--  llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp  111
-rw-r--r--  llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp  1
-rw-r--r--  llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h  1
-rw-r--r--  llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp  6
-rw-r--r--  llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp  2
-rw-r--r--  llvm/lib/Target/RISCV/RISCVFeatures.td  6
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelLowering.cpp  62
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelLowering.h  3
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfo.cpp  3
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoC.td  121
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoP.td  54
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td  95
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td  33
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoZc.td  14
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp  15
-rw-r--r--  llvm/lib/Target/RISCV/RISCVRegisterInfo.td  5
-rw-r--r--  llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td  200
-rw-r--r--  llvm/lib/Target/RISCV/RISCVTargetMachine.cpp  8
-rw-r--r--  llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp  35
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVAPI.cpp  4
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp  6
-rw-r--r--  llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp  1
-rw-r--r--  llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp  3
-rw-r--r--  llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp  4
-rw-r--r--  llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp  6
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssembly.td  15
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp  18
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp  2
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td  3
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td  11
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp  6
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp  5
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h  2
-rw-r--r--  llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp  30
-rw-r--r--  llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp  40
-rw-r--r--  llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp  1
-rw-r--r--  llvm/lib/Target/X86/X86PassRegistry.def  44
-rw-r--r--  llvm/lib/Target/X86/X86TargetTransformInfo.cpp  131
121 files changed, 2358 insertions, 1818 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index c4b43e1..c52487a 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -176,6 +176,9 @@ public:
std::optional<AArch64PACKey::ID> PACKey,
uint64_t PACDisc, Register PACAddrDisc);
+ // Emit the sequence for PAC.
+ void emitPtrauthSign(const MachineInstr *MI);
+
// Emit the sequence to compute the discriminator.
//
// The returned register is either unmodified AddrDisc or ScratchReg.
@@ -2175,6 +2178,37 @@ void AArch64AsmPrinter::emitPtrauthAuthResign(
OutStreamer->emitLabel(EndSym);
}
+void AArch64AsmPrinter::emitPtrauthSign(const MachineInstr *MI) {
+ Register Val = MI->getOperand(1).getReg();
+ auto Key = (AArch64PACKey::ID)MI->getOperand(2).getImm();
+ uint64_t Disc = MI->getOperand(3).getImm();
+ Register AddrDisc = MI->getOperand(4).getReg();
+ bool AddrDiscKilled = MI->getOperand(4).isKill();
+
+ // As long as at least one of Val and AddrDisc is in GPR64noip, a scratch
+ // register is available.
+ Register ScratchReg = Val == AArch64::X16 ? AArch64::X17 : AArch64::X16;
+ assert(ScratchReg != AddrDisc &&
+ "Neither X16 nor X17 is available as a scratch register");
+
+ // Compute pac discriminator
+ assert(isUInt<16>(Disc));
+ Register DiscReg = emitPtrauthDiscriminator(
+ Disc, AddrDisc, ScratchReg, /*MayUseAddrAsScratch=*/AddrDiscKilled);
+ bool IsZeroDisc = DiscReg == AArch64::XZR;
+ unsigned Opc = getPACOpcodeForKey(Key, IsZeroDisc);
+
+ // paciza x16 ; if IsZeroDisc
+ // pacia x16, x17 ; if !IsZeroDisc
+ MCInst PACInst;
+ PACInst.setOpcode(Opc);
+ PACInst.addOperand(MCOperand::createReg(Val));
+ PACInst.addOperand(MCOperand::createReg(Val));
+ if (!IsZeroDisc)
+ PACInst.addOperand(MCOperand::createReg(DiscReg));
+ EmitToStreamer(*OutStreamer, PACInst);
+}
+
void AArch64AsmPrinter::emitPtrauthBranch(const MachineInstr *MI) {
bool IsCall = MI->getOpcode() == AArch64::BLRA;
unsigned BrTarget = MI->getOperand(0).getReg();
@@ -2890,6 +2924,10 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) {
MI->getOperand(4).getImm(), MI->getOperand(5).getReg());
return;
+ case AArch64::PAC:
+ emitPtrauthSign(MI);
+ return;
+
case AArch64::LOADauthptrstatic:
LowerLOADauthptrstatic(*MI);
return;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 02ee517..7b49754 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -164,6 +164,9 @@ static cl::opt<bool> UseFEATCPACodegen(
/// Value type used for condition codes.
static const MVT MVT_CC = MVT::i32;
+/// Value type used for NZCV flags.
+static constexpr MVT FlagsVT = MVT::i32;
+
static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
AArch64::X3, AArch64::X4, AArch64::X5,
AArch64::X6, AArch64::X7};
@@ -3098,6 +3101,83 @@ AArch64TargetLowering::EmitGetSMESaveSize(MachineInstr &MI,
return BB;
}
+// Helper function to find the instruction that defined a virtual register.
+// If unable to find such instruction, returns nullptr.
+static const MachineInstr *stripVRegCopies(const MachineRegisterInfo &MRI,
+ Register Reg) {
+ while (Reg.isVirtual()) {
+ MachineInstr *DefMI = MRI.getVRegDef(Reg);
+ assert(DefMI && "Virtual register definition not found");
+ unsigned Opcode = DefMI->getOpcode();
+
+ if (Opcode == AArch64::COPY) {
+ Reg = DefMI->getOperand(1).getReg();
+ // Vreg is defined by copying from physreg.
+ if (Reg.isPhysical())
+ return DefMI;
+ continue;
+ }
+ if (Opcode == AArch64::SUBREG_TO_REG) {
+ Reg = DefMI->getOperand(2).getReg();
+ continue;
+ }
+
+ return DefMI;
+ }
+ return nullptr;
+}
+
+void AArch64TargetLowering::fixupPtrauthDiscriminator(
+ MachineInstr &MI, MachineBasicBlock *BB, MachineOperand &IntDiscOp,
+ MachineOperand &AddrDiscOp, const TargetRegisterClass *AddrDiscRC) const {
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ Register AddrDisc = AddrDiscOp.getReg();
+ int64_t IntDisc = IntDiscOp.getImm();
+ assert(IntDisc == 0 && "Blend components are already expanded");
+
+ const MachineInstr *DiscMI = stripVRegCopies(MRI, AddrDisc);
+ if (DiscMI) {
+ switch (DiscMI->getOpcode()) {
+ case AArch64::MOVKXi:
+ // blend(addr, imm) which is lowered as "MOVK addr, #imm, #48".
+ // #imm should be an immediate and not a global symbol, for example.
+ if (DiscMI->getOperand(2).isImm() &&
+ DiscMI->getOperand(3).getImm() == 48) {
+ AddrDisc = DiscMI->getOperand(1).getReg();
+ IntDisc = DiscMI->getOperand(2).getImm();
+ }
+ break;
+ case AArch64::MOVi32imm:
+ case AArch64::MOVi64imm:
+ // Small immediate integer constant passed via VReg.
+ if (DiscMI->getOperand(1).isImm() &&
+ isUInt<16>(DiscMI->getOperand(1).getImm())) {
+ AddrDisc = AArch64::NoRegister;
+ IntDisc = DiscMI->getOperand(1).getImm();
+ }
+ break;
+ }
+ }
+
+ // For uniformity, always use NoRegister, as XZR is not necessarily contained
+ // in the requested register class.
+ if (AddrDisc == AArch64::XZR)
+ AddrDisc = AArch64::NoRegister;
+
+ // Make sure AddrDisc operand respects the register class imposed by MI.
+ if (AddrDisc && MRI.getRegClass(AddrDisc) != AddrDiscRC) {
+ Register TmpReg = MRI.createVirtualRegister(AddrDiscRC);
+ BuildMI(*BB, MI, DL, TII->get(AArch64::COPY), TmpReg).addReg(AddrDisc);
+ AddrDisc = TmpReg;
+ }
+
+ AddrDiscOp.setReg(AddrDisc);
+ IntDiscOp.setImm(IntDisc);
+}
+
MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
MachineInstr &MI, MachineBasicBlock *BB) const {
@@ -3196,6 +3276,11 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
case AArch64::MOVT_TIZ_PSEUDO:
return EmitZTInstr(MI, BB, AArch64::MOVT_TIZ, /*Op0IsDef=*/true);
+
+ case AArch64::PAC:
+ fixupPtrauthDiscriminator(MI, BB, MI.getOperand(3), MI.getOperand(4),
+ &AArch64::GPR64noipRegClass);
+ return BB;
}
}
@@ -3451,7 +3536,7 @@ static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &DL,
}
unsigned Opcode =
IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
- return DAG.getNode(Opcode, DL, {MVT::i32, MVT::Other}, {Chain, LHS, RHS});
+ return DAG.getNode(Opcode, DL, {FlagsVT, MVT::Other}, {Chain, LHS, RHS});
}
static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
@@ -3465,7 +3550,7 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
}
- return DAG.getNode(AArch64ISD::FCMP, DL, MVT::i32, LHS, RHS);
+ return DAG.getNode(AArch64ISD::FCMP, DL, FlagsVT, LHS, RHS);
}
// The CMP instruction is just an alias for SUBS, and representing it as
@@ -3490,7 +3575,7 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
// (a.k.a. ANDS) except that the flags are only guaranteed to work for one
// of the signed comparisons.
const SDValue ANDSNode =
- DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(VT, MVT_CC),
+ DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(VT, FlagsVT),
LHS.getOperand(0), LHS.getOperand(1));
// Replace all users of (and X, Y) with newly generated (ands X, Y)
DAG.ReplaceAllUsesWith(LHS, ANDSNode);
@@ -3501,7 +3586,7 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
}
}
- return DAG.getNode(Opcode, DL, DAG.getVTList(VT, MVT_CC), LHS, RHS)
+ return DAG.getNode(Opcode, DL, DAG.getVTList(VT, FlagsVT), LHS, RHS)
.getValue(1);
}
@@ -3597,7 +3682,7 @@ static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
- return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
+ return DAG.getNode(Opcode, DL, FlagsVT, LHS, RHS, NZCVOp, Condition, CCOp);
}
/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
@@ -4036,7 +4121,7 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
// Check that the result fits into a 32-bit integer.
- SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC);
+ SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
if (IsSigned) {
// cmp xreg, wreg, sxtw
SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
@@ -4059,12 +4144,12 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
DAG.getConstant(63, DL, MVT::i64));
// It is important that LowerBits is last, otherwise the arithmetic
// shift will not be folded into the compare (SUBS).
- SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
+ SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
.getValue(1);
} else {
SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
- SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
+ SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
Overflow =
DAG.getNode(AArch64ISD::SUBS, DL, VTs,
DAG.getConstant(0, DL, MVT::i64),
@@ -4075,7 +4160,7 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
} // switch (...)
if (Opc) {
- SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
+ SDVTList VTs = DAG.getVTList(Op->getValueType(0), FlagsVT);
// Emit the AArch64 operation with overflow check.
Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
@@ -4177,7 +4262,7 @@ static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) {
SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
SDValue Cmp =
- DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::Glue), Op0, Op1);
+ DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT), Op0, Op1);
return Cmp.getValue(1);
}
@@ -4220,16 +4305,15 @@ static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG,
SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
SDLoc DL(Op);
- SDVTList VTs = DAG.getVTList(VT0, VT1);
- SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::Glue), OpLHS,
+ SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, FlagsVT), OpLHS,
OpRHS, OpCarryIn);
SDValue OutFlag =
IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
: carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);
- return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Sum, OutFlag);
+ return DAG.getMergeValues({Sum, OutFlag}, DL);
}
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
@@ -4254,8 +4338,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
Overflow =
DAG.getNode(AArch64ISD::CSEL, DL, MVT::i32, FVal, TVal, CCVal, Overflow);
- SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
- return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Value, Overflow);
+ return DAG.getMergeValues({Value, Overflow}, DL);
}
// Prefetch operands are:
@@ -6813,7 +6896,8 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
SDValue Result = DAG.getMemIntrinsicNode(
AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
- {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
+ {StoreNode->getChain(), DAG.getBitcast(MVT::v2i64, Lo),
+ DAG.getBitcast(MVT::v2i64, Hi), StoreNode->getBasePtr()},
StoreNode->getMemoryVT(), StoreNode->getMemOperand());
return Result;
}
@@ -7037,9 +7121,8 @@ SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
Op.getOperand(0));
// Generate SUBS & CSEL.
- SDValue Cmp =
- DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
- Op.getOperand(0), DAG.getConstant(0, DL, VT));
+ SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT),
+ Op.getOperand(0), DAG.getConstant(0, DL, VT));
return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
Cmp.getValue(1));
@@ -11108,7 +11191,7 @@ SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
SDValue Carry = Op.getOperand(2);
// SBCS uses a carry not a borrow so the carry flag should be inverted first.
SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
- SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, MVT::Glue),
+ SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, FlagsVT),
LHS, RHS, InvCarry);
EVT OpVT = Op.getValueType();
@@ -12441,10 +12524,10 @@ SDValue AArch64TargetLowering::LowerAsmOutputForConstraint(
// Get NZCV register. Only update chain when copyfrom is glued.
if (Glue.getNode()) {
- Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32, Glue);
+ Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, FlagsVT, Glue);
Chain = Glue.getValue(1);
} else
- Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32);
+ Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, FlagsVT);
// Extract CC code.
SDValue CC = getSETCC(Cond, Glue, DL, DAG);
@@ -18020,11 +18103,14 @@ bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
unsigned ShlAmt = C2->getZExtValue();
if (auto ShouldADD = *N->user_begin();
ShouldADD->getOpcode() == ISD::ADD && ShouldADD->hasOneUse()) {
- if (auto ShouldLOAD = dyn_cast<LoadSDNode>(*ShouldADD->user_begin())) {
- unsigned ByteVT = ShouldLOAD->getMemoryVT().getSizeInBits() / 8;
- if ((1ULL << ShlAmt) == ByteVT &&
- isIndexedLoadLegal(ISD::PRE_INC, ShouldLOAD->getMemoryVT()))
- return false;
+ if (auto Load = dyn_cast<LoadSDNode>(*ShouldADD->user_begin())) {
+ EVT MemVT = Load->getMemoryVT();
+
+ if (Load->getValueType(0).isScalableVector())
+ return (8ULL << ShlAmt) != MemVT.getScalarSizeInBits();
+
+ if (isIndexedLoadLegal(ISD::PRE_INC, MemVT))
+ return (8ULL << ShlAmt) != MemVT.getFixedSizeInBits();
}
}
}
@@ -18593,7 +18679,7 @@ AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
Created.push_back(And.getNode());
} else {
SDValue CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC);
- SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+ SDVTList VTs = DAG.getVTList(VT, FlagsVT);
SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
@@ -19482,10 +19568,10 @@ static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) {
// can select to CCMN to avoid the extra mov
SDValue AbsOp1 =
DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0));
- CCmp = DAG.getNode(AArch64ISD::CCMN, DL, MVT_CC, Cmp1.getOperand(0), AbsOp1,
- NZCVOp, Condition, Cmp0);
+ CCmp = DAG.getNode(AArch64ISD::CCMN, DL, FlagsVT, Cmp1.getOperand(0),
+ AbsOp1, NZCVOp, Condition, Cmp0);
} else {
- CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0),
+ CCmp = DAG.getNode(AArch64ISD::CCMP, DL, FlagsVT, Cmp1.getOperand(0),
Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
}
return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
@@ -25134,8 +25220,9 @@ static SDValue reassociateCSELOperandsForCSE(SDNode *N, SelectionDAG &DAG) {
if (!TReassocOp && !FReassocOp)
return SDValue();
- SDValue NewCmp = DAG.getNode(AArch64ISD::SUBS, SDLoc(SubsNode),
- DAG.getVTList(VT, MVT_CC), CmpOpOther, SubsOp);
+ SDValue NewCmp =
+ DAG.getNode(AArch64ISD::SUBS, SDLoc(SubsNode),
+ DAG.getVTList(VT, FlagsVT), CmpOpOther, SubsOp);
auto Reassociate = [&](SDValue ReassocOp, unsigned OpNum) {
if (!ReassocOp)
@@ -27161,7 +27248,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
: AArch64SysReg::RNDRRS);
SDLoc DL(N);
SDValue A = DAG.getNode(
- AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::i32, MVT::Other),
+ AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, FlagsVT, MVT::Other),
N->getOperand(0), DAG.getConstant(Register, DL, MVT::i32));
SDValue B = DAG.getNode(
AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
@@ -27907,16 +27994,16 @@ void AArch64TargetLowering::ReplaceNodeResults(
MemVT.getScalarSizeInBits() == 32u ||
MemVT.getScalarSizeInBits() == 64u)) {
+ EVT HalfVT = MemVT.getHalfNumVectorElementsVT(*DAG.getContext());
SDValue Result = DAG.getMemIntrinsicNode(
AArch64ISD::LDNP, SDLoc(N),
- DAG.getVTList({MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
- MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
- MVT::Other}),
+ DAG.getVTList({MVT::v2i64, MVT::v2i64, MVT::Other}),
{LoadNode->getChain(), LoadNode->getBasePtr()},
LoadNode->getMemoryVT(), LoadNode->getMemOperand());
SDValue Pair = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MemVT,
- Result.getValue(0), Result.getValue(1));
+ DAG.getBitcast(HalfVT, Result.getValue(0)),
+ DAG.getBitcast(HalfVT, Result.getValue(1)));
Results.append({Pair, Result.getValue(2) /* Chain */});
return;
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index d8403c2..95d0e3b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -182,6 +182,13 @@ public:
MachineBasicBlock *EmitGetSMESaveSize(MachineInstr &MI,
MachineBasicBlock *BB) const;
+ /// Replace (0, vreg) discriminator components with the operands of blend
+ /// or with (immediate, NoRegister) when possible.
+ void fixupPtrauthDiscriminator(MachineInstr &MI, MachineBasicBlock *BB,
+ MachineOperand &IntDiscOp,
+ MachineOperand &AddrDiscOp,
+ const TargetRegisterClass *AddrDiscRC) const;
+
MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const override;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index bc57537..8685d7a0 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -20,7 +20,6 @@
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CFIInstBuilder.h"
#include "llvm/CodeGen/LivePhysRegs.h"
@@ -36,7 +35,6 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/StackMaps.h"
-#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
@@ -533,8 +531,9 @@ bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
MBP.LHS = LastInst->getOperand(0);
MBP.RHS = MachineOperand::CreateImm(0);
- MBP.Predicate = LastOpc == AArch64::CBNZX ? MachineBranchPredicate::PRED_NE
- : MachineBranchPredicate::PRED_EQ;
+ MBP.Predicate = (LastOpc == AArch64::CBNZX || LastOpc == AArch64::CBNZW)
+ ? MachineBranchPredicate::PRED_NE
+ : MachineBranchPredicate::PRED_EQ;
return false;
}
@@ -7353,9 +7352,6 @@ bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const {
case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
- case AArch64MachineCombinerPattern::GATHER_LANE_i32:
- case AArch64MachineCombinerPattern::GATHER_LANE_i16:
- case AArch64MachineCombinerPattern::GATHER_LANE_i8:
return true;
} // end switch (Pattern)
return false;
@@ -7396,252 +7392,11 @@ static bool getMiscPatterns(MachineInstr &Root,
return false;
}
-static bool getGatherPattern(MachineInstr &Root,
- SmallVectorImpl<unsigned> &Patterns,
- unsigned LoadLaneOpCode, unsigned NumLanes) {
- const MachineFunction *MF = Root.getMF();
-
- // Early exit if optimizing for size.
- if (MF->getFunction().hasMinSize())
- return false;
-
- const MachineRegisterInfo &MRI = MF->getRegInfo();
- const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
-
- // The root of the pattern must load into the last lane of the vector.
- if (Root.getOperand(2).getImm() != NumLanes - 1)
- return false;
-
- // Check that we have load into all lanes except lane 0.
- // For each load we also want to check that:
- // 1. It has a single non-debug use (since we will be replacing the virtual
- // register)
- // 2. That the addressing mode only uses a single offset register.
- auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
- auto Range = llvm::seq<unsigned>(1, NumLanes - 1);
- SmallSet<unsigned, 4> RemainingLanes(Range.begin(), Range.end());
- while (!RemainingLanes.empty() && CurrInstr &&
- CurrInstr->getOpcode() == LoadLaneOpCode &&
- MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) &&
- CurrInstr->getNumOperands() == 4) {
- RemainingLanes.erase(CurrInstr->getOperand(2).getImm());
- CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
- }
-
- if (!RemainingLanes.empty())
- return false;
-
- // Match the SUBREG_TO_REG sequence.
- if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
- return false;
-
- // Verify that the subreg to reg loads an integer into the first lane.
- auto Lane0LoadReg = CurrInstr->getOperand(2).getReg();
- unsigned SingleLaneSizeInBits = 128 / NumLanes;
- if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != SingleLaneSizeInBits)
- return false;
-
- // Verify that it also has a single non debug use.
- if (!MRI.hasOneNonDBGUse(Lane0LoadReg))
- return false;
-
- switch (NumLanes) {
- case 4:
- Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i32);
- break;
- case 8:
- Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i16);
- break;
- case 16:
- Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i8);
- break;
- default:
- llvm_unreachable("Got bad number of lanes for gather pattern.");
- }
-
- return true;
-}
-
-/// Search for patterns where we use LD1 instructions to load into
-/// separate lanes of an 128 bit Neon register. We can increase Memory Level
-/// Parallelism by loading into 2 Neon registers instead.
-static bool getLoadPatterns(MachineInstr &Root,
- SmallVectorImpl<unsigned> &Patterns) {
-
- // The pattern searches for loads into single lanes.
- switch (Root.getOpcode()) {
- case AArch64::LD1i32:
- return getGatherPattern(Root, Patterns, Root.getOpcode(), 4);
- case AArch64::LD1i16:
- return getGatherPattern(Root, Patterns, Root.getOpcode(), 8);
- case AArch64::LD1i8:
- return getGatherPattern(Root, Patterns, Root.getOpcode(), 16);
- default:
- return false;
- }
-}
-
-static void
-generateGatherPattern(MachineInstr &Root,
- SmallVectorImpl<MachineInstr *> &InsInstrs,
- SmallVectorImpl<MachineInstr *> &DelInstrs,
- DenseMap<Register, unsigned> &InstrIdxForVirtReg,
- unsigned Pattern, unsigned NumLanes) {
-
- MachineFunction &MF = *Root.getParent()->getParent();
- MachineRegisterInfo &MRI = MF.getRegInfo();
- const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
-
- // Gather the initial load instructions to build the pattern
- SmallVector<MachineInstr *, 16> LoadToLaneInstrs;
- MachineInstr *CurrInstr = &Root;
- for (unsigned i = 0; i < NumLanes - 1; ++i) {
- LoadToLaneInstrs.push_back(CurrInstr);
- CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
- }
-
- // Sort the load instructions according to the lane.
- llvm::sort(LoadToLaneInstrs,
- [](const MachineInstr *A, const MachineInstr *B) {
- return A->getOperand(2).getImm() > B->getOperand(2).getImm();
- });
-
- MachineInstr *SubregToReg = CurrInstr;
- LoadToLaneInstrs.push_back(
- MRI.getUniqueVRegDef(SubregToReg->getOperand(2).getReg()));
- auto LoadToLaneInstrsAscending = llvm::reverse(LoadToLaneInstrs);
-
- const TargetRegisterClass *FPR128RegClass =
- MRI.getRegClass(Root.getOperand(0).getReg());
-
- auto LoadLaneToRegister = [&](MachineInstr *OriginalInstr,
- Register SrcRegister, unsigned Lane,
- Register OffsetRegister) {
- auto NewRegister = MRI.createVirtualRegister(FPR128RegClass);
- MachineInstrBuilder LoadIndexIntoRegister =
- BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()),
- NewRegister)
- .addReg(SrcRegister)
- .addImm(Lane)
- .addReg(OffsetRegister, getKillRegState(true));
- InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size()));
- InsInstrs.push_back(LoadIndexIntoRegister);
- return NewRegister;
- };
-
- // Helper to create load instruction based on opcode
- auto CreateLoadInstruction = [&](unsigned NumLanes, Register DestReg,
- Register OffsetReg) -> MachineInstrBuilder {
- unsigned Opcode;
- switch (NumLanes) {
- case 4:
- Opcode = AArch64::LDRSui;
- break;
- case 8:
- Opcode = AArch64::LDRHui;
- break;
- case 16:
- Opcode = AArch64::LDRBui;
- break;
- default:
- llvm_unreachable(
- "Got unsupported number of lanes in machine-combiner gather pattern");
- }
- // Immediate offset load
- return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg)
- .addReg(OffsetReg)
- .addImm(0); // immediate offset
- };
-
- // Load the remaining lanes into register 0.
- auto LanesToLoadToReg0 =
- llvm::make_range(LoadToLaneInstrsAscending.begin() + 1,
- LoadToLaneInstrsAscending.begin() + NumLanes / 2);
- auto PrevReg = SubregToReg->getOperand(0).getReg();
- for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg0)) {
- PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1,
- LoadInstr->getOperand(3).getReg());
- DelInstrs.push_back(LoadInstr);
- }
- auto LastLoadReg0 = PrevReg;
-
- // First load into register 1. Perform a LDRSui to zero out the upper lanes in
- // a single instruction.
- auto Lane0Load = *LoadToLaneInstrsAscending.begin();
- auto OriginalSplitLoad =
- *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2);
- auto DestRegForMiddleIndex = MRI.createVirtualRegister(
- MRI.getRegClass(Lane0Load->getOperand(0).getReg()));
-
- MachineInstrBuilder MiddleIndexLoadInstr =
- CreateLoadInstruction(NumLanes, DestRegForMiddleIndex,
- OriginalSplitLoad->getOperand(3).getReg());
-
- InstrIdxForVirtReg.insert(
- std::make_pair(DestRegForMiddleIndex, InsInstrs.size()));
- InsInstrs.push_back(MiddleIndexLoadInstr);
- DelInstrs.push_back(OriginalSplitLoad);
-
- // Subreg To Reg instruction for register 1.
- auto DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass);
- unsigned SubregType;
- switch (NumLanes) {
- case 4:
- SubregType = AArch64::ssub;
- break;
- case 8:
- SubregType = AArch64::hsub;
- break;
- case 16:
- SubregType = AArch64::bsub;
- break;
- default:
- llvm_unreachable(
- "Got invalid NumLanes for machine-combiner gather pattern");
- }
-
- auto SubRegToRegInstr =
- BuildMI(MF, MIMetadata(Root), TII->get(SubregToReg->getOpcode()),
- DestRegForSubregToReg)
- .addImm(0)
- .addReg(DestRegForMiddleIndex, getKillRegState(true))
- .addImm(SubregType);
- InstrIdxForVirtReg.insert(
- std::make_pair(DestRegForSubregToReg, InsInstrs.size()));
- InsInstrs.push_back(SubRegToRegInstr);
-
- // Load remaining lanes into register 1.
- auto LanesToLoadToReg1 =
- llvm::make_range(LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1,
- LoadToLaneInstrsAscending.end());
- PrevReg = SubRegToRegInstr->getOperand(0).getReg();
- for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg1)) {
- PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1,
- LoadInstr->getOperand(3).getReg());
- if (Index == NumLanes / 2 - 2) {
- break;
- }
- DelInstrs.push_back(LoadInstr);
- }
- auto LastLoadReg1 = PrevReg;
-
- // Create the final zip instruction to combine the results.
- MachineInstrBuilder ZipInstr =
- BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64),
- Root.getOperand(0).getReg())
- .addReg(LastLoadReg0)
- .addReg(LastLoadReg1);
- InsInstrs.push_back(ZipInstr);
-}
-
CombinerObjective
AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
switch (Pattern) {
case AArch64MachineCombinerPattern::SUBADD_OP1:
case AArch64MachineCombinerPattern::SUBADD_OP2:
- case AArch64MachineCombinerPattern::GATHER_LANE_i32:
- case AArch64MachineCombinerPattern::GATHER_LANE_i16:
- case AArch64MachineCombinerPattern::GATHER_LANE_i8:
return CombinerObjective::MustReduceDepth;
default:
return TargetInstrInfo::getCombinerObjective(Pattern);
@@ -7671,10 +7426,6 @@ bool AArch64InstrInfo::getMachineCombinerPatterns(
if (getMiscPatterns(Root, Patterns))
return true;
- // Load patterns
- if (getLoadPatterns(Root, Patterns))
- return true;
-
return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
DoRegPressureReduce);
}
@@ -8930,21 +8681,6 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
break;
}
- case AArch64MachineCombinerPattern::GATHER_LANE_i32: {
- generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
- Pattern, 4);
- break;
- }
- case AArch64MachineCombinerPattern::GATHER_LANE_i16: {
- generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
- Pattern, 8);
- break;
- }
- case AArch64MachineCombinerPattern::GATHER_LANE_i8: {
- generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
- Pattern, 16);
- break;
- }
} // end switch (Pattern)
// Record MUL and ADD/SUB for deletion
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 02734866..7c255da 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -172,10 +172,6 @@ enum AArch64MachineCombinerPattern : unsigned {
FMULv8i16_indexed_OP2,
FNMADD,
-
- GATHER_LANE_i32,
- GATHER_LANE_i16,
- GATHER_LANE_i8
};
class AArch64InstrInfo final : public AArch64GenInstrInfo {
const AArch64RegisterInfo RI;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 9f8a257..07cacfa 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -430,26 +430,27 @@ def UseWzrToVecMove : Predicate<"Subtarget->useWzrToVecMove()">;
def SDTBinaryArithWithFlagsOut : SDTypeProfile<2, 2,
[SDTCisSameAs<0, 2>,
SDTCisSameAs<0, 3>,
- SDTCisInt<0>, SDTCisVT<1, i32>]>;
+ SDTCisInt<0>,
+ SDTCisVT<1, FlagsVT>]>;
// SDTBinaryArithWithFlagsIn - RES1, FLAGS = op LHS, RHS, FLAGS
def SDTBinaryArithWithFlagsIn : SDTypeProfile<1, 3,
[SDTCisSameAs<0, 1>,
SDTCisSameAs<0, 2>,
SDTCisInt<0>,
- SDTCisVT<3, i32>]>;
+ SDTCisVT<3, FlagsVT>]>;
// SDTBinaryArithWithFlagsInOut - RES1, FLAGS = op LHS, RHS, FLAGS
def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3,
[SDTCisSameAs<0, 2>,
SDTCisSameAs<0, 3>,
SDTCisInt<0>,
- SDTCisVT<1, i32>,
- SDTCisVT<4, i32>]>;
+ SDTCisVT<1, FlagsVT>,
+ SDTCisVT<4, FlagsVT>]>;
def SDT_AArch64Brcond : SDTypeProfile<0, 3,
[SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>,
- SDTCisVT<2, i32>]>;
+ SDTCisVT<2, FlagsVT>]>;
def SDT_AArch64cbz : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisVT<1, OtherVT>]>;
def SDT_AArch64tbz : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>,
SDTCisVT<2, OtherVT>]>;
@@ -458,22 +459,22 @@ def SDT_AArch64CSel : SDTypeProfile<1, 4,
[SDTCisSameAs<0, 1>,
SDTCisSameAs<0, 2>,
SDTCisInt<3>,
- SDTCisVT<4, i32>]>;
+ SDTCisVT<4, FlagsVT>]>;
def SDT_AArch64CCMP : SDTypeProfile<1, 5,
- [SDTCisVT<0, i32>,
+ [SDTCisVT<0, FlagsVT>,
SDTCisInt<1>,
SDTCisSameAs<1, 2>,
SDTCisInt<3>,
SDTCisInt<4>,
SDTCisVT<5, i32>]>;
def SDT_AArch64FCCMP : SDTypeProfile<1, 5,
- [SDTCisVT<0, i32>,
+ [SDTCisVT<0, FlagsVT>,
SDTCisFP<1>,
SDTCisSameAs<1, 2>,
SDTCisInt<3>,
SDTCisInt<4>,
SDTCisVT<5, i32>]>;
-def SDT_AArch64FCmp : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
+def SDT_AArch64FCmp : SDTypeProfile<1, 2, [SDTCisVT<0, FlagsVT>,
SDTCisFP<1>,
SDTCisSameAs<2, 1>]>;
def SDT_AArch64Rev : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>]>;
@@ -518,10 +519,10 @@ def SDT_AArch64uaddlp : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>;
def SDT_AArch64ldp : SDTypeProfile<2, 1, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
def SDT_AArch64ldiapp : SDTypeProfile<2, 1, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
-def SDT_AArch64ldnp : SDTypeProfile<2, 1, [SDTCisVT<0, v4i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
+def SDT_AArch64ldnp : SDTypeProfile<2, 1, [SDTCisVT<0, v2i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
def SDT_AArch64stp : SDTypeProfile<0, 3, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
def SDT_AArch64stilp : SDTypeProfile<0, 3, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
-def SDT_AArch64stnp : SDTypeProfile<0, 3, [SDTCisVT<0, v4i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
+def SDT_AArch64stnp : SDTypeProfile<0, 3, [SDTCisVT<0, v2i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
// Generates the general dynamic sequences, i.e.
// adrp x0, :tlsdesc:var
@@ -1124,10 +1125,10 @@ def AArch64probedalloca
SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
[SDNPHasChain, SDNPMayStore]>;
-// MRS, also sets the flags via a glue.
+// MRS, also sets the flags.
def AArch64mrs : SDNode<"AArch64ISD::MRS",
SDTypeProfile<2, 1, [SDTCisVT<0, i64>,
- SDTCisVT<1, i32>,
+ SDTCisVT<1, FlagsVT>,
SDTCisVT<2, i32>]>,
[SDNPHasChain]>;
@@ -2032,7 +2033,7 @@ let Predicates = [HasPAuth] in {
def DZB : SignAuthZero<prefix_z, 0b11, !strconcat(asm, "dzb"), op>;
}
- defm PAC : SignAuth<0b000, 0b010, "pac", int_ptrauth_sign>;
+ defm PAC : SignAuth<0b000, 0b010, "pac", null_frag>;
defm AUT : SignAuth<0b001, 0b011, "aut", null_frag>;
def XPACI : ClearAuth<0, "xpaci">;
@@ -2152,6 +2153,26 @@ let Predicates = [HasPAuth] in {
let Uses = [];
}
+ // PAC pseudo instruction. In AsmPrinter, it is expanded into an actual PAC*
+ // instruction immediately preceded by the discriminator computation.
+ // This enforces the expected immediate modifier is used for signing, even
+ // if an attacker is able to substitute AddrDisc.
+ def PAC : Pseudo<(outs GPR64:$SignedVal),
+ (ins GPR64:$Val, i32imm:$Key, i64imm:$Disc, GPR64noip:$AddrDisc),
+ [], "$SignedVal = $Val">, Sched<[WriteI, ReadI]> {
+ let isCodeGenOnly = 1;
+ let hasSideEffects = 0;
+ let mayStore = 0;
+ let mayLoad = 0;
+ let Size = 12;
+ let Defs = [X16, X17];
+ let usesCustomInserter = 1;
+ }
+
+ // A standalone pattern is used, so that literal 0 can be passed as $Disc.
+ def : Pat<(int_ptrauth_sign GPR64:$Val, timm:$Key, GPR64noip:$AddrDisc),
+ (PAC GPR64:$Val, $Key, 0, GPR64noip:$AddrDisc)>;
+
// AUT and re-PAC a value, using different keys/data.
// This directly manipulates x16/x17, which are the only registers that
// certain OSs guarantee are safe to use for sensitive operations.
@@ -3934,6 +3955,26 @@ defm LDRSW : LoadUI<0b10, 0, 0b10, GPR64, uimm12s4, "ldrsw",
def : Pat<(i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))),
(SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>;
+// load zero-extended i32, bitcast to f64
+def : Pat <(f64 (bitconvert (i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
+ (SUBREG_TO_REG (i64 0), (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
+
+// load zero-extended i16, bitcast to f64
+def : Pat <(f64 (bitconvert (i64 (zextloadi16 (am_indexed32 GPR64sp:$Rn, uimm12s2:$offset))))),
+ (SUBREG_TO_REG (i64 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
+
+// load zero-extended i8, bitcast to f64
+def : Pat <(f64 (bitconvert (i64 (zextloadi8 (am_indexed32 GPR64sp:$Rn, uimm12s1:$offset))))),
+ (SUBREG_TO_REG (i64 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
+
+// load zero-extended i16, bitcast to f32
+def : Pat <(f32 (bitconvert (i32 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
+ (SUBREG_TO_REG (i32 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
+
+// load zero-extended i8, bitcast to f32
+def : Pat <(f32 (bitconvert (i32 (zextloadi8 (am_indexed16 GPR64sp:$Rn, uimm12s1:$offset))))),
+ (SUBREG_TO_REG (i32 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
+
// Pre-fetch.
def PRFMui : PrefetchUI<0b11, 0, 0b10, "prfm",
[(AArch64Prefetch timm:$Rt,
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
index 61bf87f..1a7609b 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -305,7 +305,8 @@ def GPR64pi48 : RegisterOperand<GPR64, "printPostIncOperand<48>">;
def GPR64pi64 : RegisterOperand<GPR64, "printPostIncOperand<64>">;
// Condition code regclass.
-def CCR : RegisterClass<"AArch64", [i32], 32, (add NZCV)> {
+defvar FlagsVT = i32;
+def CCR : RegisterClass<"AArch64", [FlagsVT], 32, (add NZCV)> {
let CopyCost = -1; // Don't allow copying of status registers.
// CCR is not allocatable.
diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index bafb8d0..8a5b5ba 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -32,10 +32,29 @@ AArch64SelectionDAGInfo::AArch64SelectionDAGInfo()
void AArch64SelectionDAGInfo::verifyTargetNode(const SelectionDAG &DAG,
const SDNode *N) const {
+ SelectionDAGGenTargetInfo::verifyTargetNode(DAG, N);
+
#ifndef NDEBUG
+ // Some additional checks not yet implemented by verifyTargetNode.
+ constexpr MVT FlagsVT = MVT::i32;
switch (N->getOpcode()) {
- default:
- return SelectionDAGGenTargetInfo::verifyTargetNode(DAG, N);
+ case AArch64ISD::SUBS:
+ assert(N->getValueType(1) == FlagsVT);
+ break;
+ case AArch64ISD::ADC:
+ case AArch64ISD::SBC:
+ assert(N->getOperand(2).getValueType() == FlagsVT);
+ break;
+ case AArch64ISD::ADCS:
+ case AArch64ISD::SBCS:
+ assert(N->getValueType(1) == FlagsVT);
+ assert(N->getOperand(2).getValueType() == FlagsVT);
+ break;
+ case AArch64ISD::CSEL:
+ case AArch64ISD::CSINC:
+ case AArch64ISD::BRCOND:
+ assert(N->getOperand(3).getValueType() == FlagsVT);
+ break;
case AArch64ISD::SADDWT:
case AArch64ISD::SADDWB:
case AArch64ISD::UADDWT:
diff --git a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp
index c218831..85de2d5 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp
@@ -36,7 +36,7 @@ void AArch64_ELFTargetObjectFile::Initialize(MCContext &Ctx,
// SHF_AARCH64_PURECODE flag set if the "+execute-only" target feature is
// present.
if (TM.getMCSubtargetInfo()->hasFeature(AArch64::FeatureExecuteOnly)) {
- auto *Text = cast<MCSectionELF>(TextSection);
+ auto *Text = static_cast<MCSectionELF *>(TextSection);
Text->setFlags(Text->getFlags() | ELF::SHF_AARCH64_PURECODE);
}
}
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index 08f547a..6257e99 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -523,7 +523,8 @@ void AArch64TargetELFStreamer::finish() {
// mark it execute-only if it is empty and there is at least one
// execute-only section in the object.
if (any_of(Asm, [](const MCSection &Sec) {
- return cast<MCSectionELF>(Sec).getFlags() & ELF::SHF_AARCH64_PURECODE;
+ return static_cast<const MCSectionELF &>(Sec).getFlags() &
+ ELF::SHF_AARCH64_PURECODE;
})) {
auto *Text =
static_cast<MCSectionELF *>(Ctx.getObjectFileInfo()->getTextSection());
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
index 3d4a14b..1a9bce5 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
@@ -9,8 +9,6 @@
#include "AArch64MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCStreamer.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
index 1ac340a..a22a17a 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
@@ -132,7 +132,8 @@ static bool canUseLocalRelocation(const MCSectionMachO &Section,
// But only if they don't point to a few forbidden sections.
if (!Symbol.isInSection())
return true;
- const MCSectionMachO &RefSec = cast<MCSectionMachO>(Symbol.getSection());
+ const MCSectionMachO &RefSec =
+ static_cast<MCSectionMachO &>(Symbol.getSection());
if (RefSec.getType() == MachO::S_CSTRING_LITERALS)
return false;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index e4e7bdc..8b8fc8b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -149,6 +149,12 @@ def FeatureFmaMixInsts : SubtargetFeature<"fma-mix-insts",
"Has v_fma_mix_f32, v_fma_mixlo_f16, v_fma_mixhi_f16 instructions"
>;
+def FeatureFmaMixBF16Insts : SubtargetFeature<"fma-mix-bf16-insts",
+ "HasFmaMixBF16Insts",
+ "true",
+ "Has v_fma_mix_f32_bf16, v_fma_mixlo_bf16, v_fma_mixhi_bf16 instructions"
+>;
+
def FeatureIEEEMinimumMaximumInsts : SubtargetFeature<"ieee-minimum-maximum-insts",
"HasIEEEMinimumMaximumInsts",
"true",
@@ -167,6 +173,12 @@ def FeatureMinimum3Maximum3F16 : SubtargetFeature<"minimum3-maximum3-f16",
"Has v_minimum3_f16 and v_maximum3_f16 instructions"
>;
+def FeatureMin3Max3PKF16 : SubtargetFeature<"min3-max3-pkf16",
+ "HasMin3Max3PKF16",
+ "true",
+ "Has v_pk_min3_num_f16 and v_pk_max3_num_f16 instructions"
+>;
+
def FeatureMinimum3Maximum3PKF16 : SubtargetFeature<"minimum3-maximum3-pkf16",
"HasMinimum3Maximum3PKF16",
"true",
@@ -256,12 +268,24 @@ def FeatureInstFwdPrefetchBug : SubtargetFeature<"inst-fwd-prefetch-bug",
"S_INST_PREFETCH instruction causes shader to hang"
>;
+def FeatureVmemPrefInsts : SubtargetFeature<"vmem-pref-insts",
+ "HasVmemPrefInsts",
+ "true",
+ "Has flat_prefect_b8 and global_prefetch_b8 instructions"
+>;
+
def FeatureSafeSmemPrefetch : SubtargetFeature<"safe-smem-prefetch",
"HasSafeSmemPrefetch",
"true",
"SMEM prefetches do not fail on illegal address"
>;
+def FeatureSafeCUPrefetch : SubtargetFeature<"safe-cu-prefetch",
+ "HasSafeCUPrefetch",
+ "true",
+ "VMEM CU scope prefetches do not fail on illegal address"
+>;
+
def FeatureVcmpxExecWARHazard : SubtargetFeature<"vcmpx-exec-war-hazard",
"HasVcmpxExecWARHazard",
"true",
@@ -559,6 +583,12 @@ def FeatureBF16ConversionInsts : SubtargetFeature<"bf16-cvt-insts",
"Has bf16 conversion instructions"
>;
+def FeatureBF16PackedInsts : SubtargetFeature<"bf16-pk-insts",
+ "HasBF16PackedInsts",
+ "true",
+ "Has bf16 packed instructions (fma, add, mul, max, min)"
+>;
+
def FeatureVOP3P : SubtargetFeature<"vop3p",
"HasVOP3PInsts",
"true",
@@ -1349,6 +1379,10 @@ def FeatureLshlAddU64Inst
: SubtargetFeature<"lshl-add-u64-inst", "HasLshlAddU64Inst", "true",
"Has v_lshl_add_u64 instruction">;
+def FeatureAddSubU64Insts
+ : SubtargetFeature<"add-sub-u64-insts", "HasAddSubU64Insts", "true",
+ "Has v_add_u64 and v_sub_u64 instructions">;
+
def FeatureMemToLDSLoad : SubtargetFeature<"vmem-to-lds-load-insts",
"HasVMemToLDSLoad",
"true",
@@ -1989,7 +2023,10 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureTransposeLoadF4F6Insts,
FeatureBF16TransInsts,
FeatureBF16ConversionInsts,
+ FeatureBF16PackedInsts,
FeatureCvtPkF16F32Inst,
+ FeatureFmaMixBF16Insts,
+ FeatureMin3Max3PKF16,
FeatureMinimum3Maximum3PKF16,
FeaturePrngInst,
FeaturePermlane16Swap,
@@ -2002,7 +2039,9 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureFlatBufferGlobalAtomicFaddF64Inst,
FeatureMemoryAtomicFAddF32DenormalSupport,
FeatureKernargPreload,
+ FeatureVmemPrefInsts,
FeatureLshlAddU64Inst,
+ FeatureAddSubU64Insts,
FeatureLdsBarrierArriveAtomic,
FeatureSetPrioIncWgInst,
]>;
@@ -2349,6 +2388,10 @@ def HasMinimum3Maximum3F16 :
Predicate<"Subtarget->hasMinimum3Maximum3F16()">,
AssemblerPredicate<(all_of FeatureMinimum3Maximum3F16)>;
+def HasMin3Max3PKF16 :
+ Predicate<"Subtarget->hasMin3Max3PKF16()">,
+ AssemblerPredicate<(all_of FeatureMin3Max3PKF16)>;
+
def HasMinimum3Maximum3PKF16 :
Predicate<"Subtarget->hasMinimum3Maximum3PKF16()">,
AssemblerPredicate<(all_of FeatureMinimum3Maximum3PKF16)>;
@@ -2472,6 +2515,9 @@ def HasBF16TransInsts : Predicate<"Subtarget->hasBF16TransInsts()">,
def HasBF16ConversionInsts : Predicate<"Subtarget->hasBF16ConversionInsts()">,
AssemblerPredicate<(all_of FeatureBF16ConversionInsts)>;
+def HasBF16PackedInsts : Predicate<"Subtarget->hasBF16PackedInsts()">,
+ AssemblerPredicate<(all_of FeatureBF16PackedInsts)>;
+
def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">,
AssemblerPredicate<(all_of FeatureVOP3P)>;
@@ -2573,6 +2619,9 @@ def HasMovrel : Predicate<"Subtarget->hasMovrel()">,
def HasFmaMixInsts : Predicate<"Subtarget->hasFmaMixInsts()">,
AssemblerPredicate<(all_of FeatureFmaMixInsts)>;
+def HasFmaMixBF16Insts : Predicate<"Subtarget->hasFmaMixBF16Insts()">,
+ AssemblerPredicate<(all_of FeatureFmaMixBF16Insts)>;
+
def HasDLInsts : Predicate<"Subtarget->hasDLInsts()">,
AssemblerPredicate<(all_of FeatureDLInsts)>;
@@ -2771,12 +2820,18 @@ def HasScalarDwordx3Loads : Predicate<"Subtarget->hasScalarDwordx3Loads()">;
def HasXF32Insts : Predicate<"Subtarget->hasXF32Insts()">,
AssemblerPredicate<(all_of FeatureXF32Insts)>;
+def HasVmemPrefInsts : Predicate<"Subtarget->hasVmemPrefInsts()">,
+ AssemblerPredicate<(all_of FeatureVmemPrefInsts)>;
+
def HasAshrPkInsts : Predicate<"Subtarget->hasAshrPkInsts()">,
AssemblerPredicate<(all_of FeatureAshrPkInsts)>;
def HasLshlAddU64Inst : Predicate<"Subtarget->hasLshlAddU64Inst()">,
AssemblerPredicate<(all_of FeatureLshlAddU64Inst)>;
+def HasAddSubU64Insts : Predicate<"Subtarget->hasAddSubU64Insts()">,
+ AssemblerPredicate<(all_of FeatureAddSubU64Insts)>;
+
def HasLdsBarrierArriveAtomic : Predicate<"Subtarget->hasLdsBarrierArriveAtomic()">,
AssemblerPredicate<(all_of FeatureLdsBarrierArriveAtomic)>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index dedee46..49d8b44 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -1383,7 +1383,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
&AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID,
&AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
&AAUnderlyingObjects::ID, &AANoAliasAddrSpace::ID, &AAAddressSpace::ID,
- &AAIndirectCallInfo::ID, &AAInstanceInfo::ID});
+ &AAIndirectCallInfo::ID});
AttributorConfig AC(CGUpdater);
AC.IsClosedWorldModule = Options.IsClosedWorld;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 891d362..c01e5d3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -137,6 +137,9 @@ def gi_global_offset :
def gi_global_saddr :
GIComplexOperandMatcher<s64, "selectGlobalSAddr">,
GIComplexPatternEquiv<GlobalSAddr>;
+def gi_global_saddr_cpol :
+ GIComplexOperandMatcher<s64, "selectGlobalSAddrCPol">,
+ GIComplexPatternEquiv<GlobalSAddrCPol>;
def gi_global_saddr_glc :
GIComplexOperandMatcher<s64, "selectGlobalSAddrGLC">,
GIComplexPatternEquiv<GlobalSAddrGLC>;
@@ -446,5 +449,8 @@ def gi_fp_pow2_to_exponent : GICustomOperandRenderer<"renderFPPow2ToExponent">,
def gi_as_hw_round_mode : GICustomOperandRenderer<"renderRoundMode">,
GISDNodeXFormEquiv<as_hw_round_mode>;
+def gi_prefetch_loc : GICustomOperandRenderer<"renderPrefetchLoc">,
+ GISDNodeXFormEquiv<PrefetchLoc>;
+
def gi_MFMALdScaleModifierOp : GICustomOperandRenderer<"renderScaledMAIIntrinsicOperand">,
GISDNodeXFormEquiv<MFMALdScaleXForm>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
index 00979f4..f36935d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
@@ -117,45 +117,72 @@ static LLT getReadAnyLaneSplitTy(LLT Ty) {
return LLT::scalar(32);
}
-static Register buildReadAnyLane(MachineIRBuilder &B, Register VgprSrc,
- const RegisterBankInfo &RBI);
-
-static void unmergeReadAnyLane(MachineIRBuilder &B,
- SmallVectorImpl<Register> &SgprDstParts,
- LLT UnmergeTy, Register VgprSrc,
- const RegisterBankInfo &RBI) {
+template <typename ReadLaneFnTy>
+static Register buildReadLane(MachineIRBuilder &, Register,
+ const RegisterBankInfo &, ReadLaneFnTy);
+
+template <typename ReadLaneFnTy>
+static void
+unmergeReadAnyLane(MachineIRBuilder &B, SmallVectorImpl<Register> &SgprDstParts,
+ LLT UnmergeTy, Register VgprSrc, const RegisterBankInfo &RBI,
+ ReadLaneFnTy BuildRL) {
const RegisterBank *VgprRB = &RBI.getRegBank(AMDGPU::VGPRRegBankID);
auto Unmerge = B.buildUnmerge({VgprRB, UnmergeTy}, VgprSrc);
for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) {
- SgprDstParts.push_back(buildReadAnyLane(B, Unmerge.getReg(i), RBI));
+ SgprDstParts.push_back(buildReadLane(B, Unmerge.getReg(i), RBI, BuildRL));
}
}
-static Register buildReadAnyLane(MachineIRBuilder &B, Register VgprSrc,
- const RegisterBankInfo &RBI) {
+template <typename ReadLaneFnTy>
+static Register buildReadLane(MachineIRBuilder &B, Register VgprSrc,
+ const RegisterBankInfo &RBI,
+ ReadLaneFnTy BuildRL) {
LLT Ty = B.getMRI()->getType(VgprSrc);
const RegisterBank *SgprRB = &RBI.getRegBank(AMDGPU::SGPRRegBankID);
if (Ty.getSizeInBits() == 32) {
- return B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {{SgprRB, Ty}}, {VgprSrc})
- .getReg(0);
+ Register SgprDst = B.getMRI()->createVirtualRegister({SgprRB, Ty});
+ return BuildRL(B, SgprDst, VgprSrc).getReg(0);
}
SmallVector<Register, 8> SgprDstParts;
- unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI);
+ unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI,
+ BuildRL);
return B.buildMergeLikeInstr({SgprRB, Ty}, SgprDstParts).getReg(0);
}
-void AMDGPU::buildReadAnyLane(MachineIRBuilder &B, Register SgprDst,
- Register VgprSrc, const RegisterBankInfo &RBI) {
+template <typename ReadLaneFnTy>
+static void buildReadLane(MachineIRBuilder &B, Register SgprDst,
+ Register VgprSrc, const RegisterBankInfo &RBI,
+ ReadLaneFnTy BuildReadLane) {
LLT Ty = B.getMRI()->getType(VgprSrc);
if (Ty.getSizeInBits() == 32) {
- B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {SgprDst}, {VgprSrc});
+ BuildReadLane(B, SgprDst, VgprSrc);
return;
}
SmallVector<Register, 8> SgprDstParts;
- unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI);
+ unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI,
+ BuildReadLane);
B.buildMergeLikeInstr(SgprDst, SgprDstParts).getReg(0);
}
+
+void AMDGPU::buildReadAnyLane(MachineIRBuilder &B, Register SgprDst,
+ Register VgprSrc, const RegisterBankInfo &RBI) {
+ return buildReadLane(
+ B, SgprDst, VgprSrc, RBI,
+ [](MachineIRBuilder &B, Register SgprDst, Register VgprSrc) {
+ return B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {SgprDst}, {VgprSrc});
+ });
+}
+
+void AMDGPU::buildReadFirstLane(MachineIRBuilder &B, Register SgprDst,
+ Register VgprSrc, const RegisterBankInfo &RBI) {
+ return buildReadLane(
+ B, SgprDst, VgprSrc, RBI,
+ [](MachineIRBuilder &B, Register SgprDst, Register VgprSrc) {
+ return B.buildIntrinsic(Intrinsic::amdgcn_readfirstlane, SgprDst)
+ .addReg(VgprSrc);
+ });
+}
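For orientation, a minimal usage sketch of the new AMDGPU::buildReadFirstLane helper added above (the wrapper function uniformizeS64 and its includes are illustrative, not part of the patch): for a 64-bit VGPR source the helper unmerges into 32-bit pieces, emits one amdgcn_readfirstlane per piece, and merges the results back into the SGPR destination.

#include "AMDGPUGlobalISelUtils.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/RegisterBankInfo.h"

using namespace llvm;

// Hypothetical caller: uniformize a divergent 64-bit value before feeding it
// to an instruction that only accepts SGPR operands.
static Register uniformizeS64(MachineIRBuilder &B, Register VgprSrc,
                              const RegisterBankInfo &RBI) {
  MachineRegisterInfo &MRI = *B.getMRI();
  const RegisterBank *SgprRB = &RBI.getRegBank(AMDGPU::SGPRRegBankID);
  // Mirrors the register creation used by the waterfall lowering.
  Register SgprDst = MRI.createVirtualRegister({SgprRB, LLT::scalar(64)});
  AMDGPU::buildReadFirstLane(B, SgprDst, VgprSrc, RBI);
  return SgprDst;
}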
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
index 0c89bb5..5e1000e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
@@ -51,6 +51,8 @@ private:
void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc,
const RegisterBankInfo &RBI);
+void buildReadFirstLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc,
+ const RegisterBankInfo &RBI);
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 5a2416de..dfaa145 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -2020,6 +2020,22 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
return true;
}
+bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPol(SDNode *N, SDValue Addr,
+ SDValue &SAddr, SDValue &VOffset,
+ SDValue &Offset,
+ SDValue &CPol) const {
+ bool ScaleOffset;
+ if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
+ return false;
+
+ // We are assuming CPol is always the last operand of the intrinsic.
+ auto PassedCPol =
+ N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
+ CPol = CurDAG->getTargetConstant(
+ (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
+ return true;
+}
+
bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr,
SDValue &SAddr, SDValue &VOffset,
SDValue &Offset,
@@ -3861,58 +3877,114 @@ bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
return SelectVOP3Mods(In, Src, SrcMods);
}
+// Match lowered fpext from bf16 to f32. This is a bit operation extending
+// a 16-bit value with 16 bits of zeroes at the LSB:
+//
+// 1. (f32 (bitcast (build_vector (i16 0), (i16 (bitcast bf16:val)))))
+// 2. (f32 (bitcast (and i32:val, 0xffff0000))) -> IsExtractHigh = true
+// 3. (f32 (bitcast (shl i32:val, 16))) -> IsExtractHigh = false
+static SDValue matchBF16FPExtendLike(SDValue Op, bool &IsExtractHigh) {
+ if (Op.getValueType() != MVT::f32 || Op.getOpcode() != ISD::BITCAST)
+ return SDValue();
+ Op = Op.getOperand(0);
+
+ IsExtractHigh = false;
+ if (Op.getValueType() == MVT::v2i16 && Op.getOpcode() == ISD::BUILD_VECTOR) {
+ auto Low16 = dyn_cast<ConstantSDNode>(Op.getOperand(0));
+ if (!Low16 || !Low16->isZero())
+ return SDValue();
+ Op = stripBitcast(Op.getOperand(1));
+ if (Op.getValueType() != MVT::bf16)
+ return SDValue();
+ return Op;
+ }
+
+ if (Op.getValueType() != MVT::i32)
+ return SDValue();
+
+ if (Op.getOpcode() == ISD::AND) {
+ if (auto Mask = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ if (Mask->getZExtValue() == 0xffff0000) {
+ IsExtractHigh = true;
+ return Op.getOperand(0);
+ }
+ }
+ return SDValue();
+ }
+
+ if (Op.getOpcode() == ISD::SHL) {
+ if (auto Amt = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ if (Amt->getZExtValue() == 16)
+ return Op.getOperand(0);
+ }
+ }
+
+ return SDValue();
+}
+
// The return value is not whether the match is possible (which it always is),
 // but whether or not a conversion is really used.
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
- unsigned &Mods) const {
+ unsigned &Mods,
+ MVT VT) const {
Mods = 0;
SelectVOP3ModsImpl(In, Src, Mods);
+ bool IsExtractHigh = false;
if (Src.getOpcode() == ISD::FP_EXTEND) {
Src = Src.getOperand(0);
- assert(Src.getValueType() == MVT::f16);
- Src = stripBitcast(Src);
+ } else if (VT == MVT::bf16) {
+ SDValue B16 = matchBF16FPExtendLike(Src, IsExtractHigh);
+ if (!B16)
+ return false;
+ Src = B16;
+ } else
+ return false;
- // Be careful about folding modifiers if we already have an abs. fneg is
- // applied last, so we don't want to apply an earlier fneg.
- if ((Mods & SISrcMods::ABS) == 0) {
- unsigned ModsTmp;
- SelectVOP3ModsImpl(Src, Src, ModsTmp);
+ if (Src.getValueType() != VT &&
+ (VT != MVT::bf16 || Src.getValueType() != MVT::i32))
+ return false;
- if ((ModsTmp & SISrcMods::NEG) != 0)
- Mods ^= SISrcMods::NEG;
+ Src = stripBitcast(Src);
- if ((ModsTmp & SISrcMods::ABS) != 0)
- Mods |= SISrcMods::ABS;
- }
+ // Be careful about folding modifiers if we already have an abs. fneg is
+ // applied last, so we don't want to apply an earlier fneg.
+ if ((Mods & SISrcMods::ABS) == 0) {
+ unsigned ModsTmp;
+ SelectVOP3ModsImpl(Src, Src, ModsTmp);
- // op_sel/op_sel_hi decide the source type and source.
- // If the source's op_sel_hi is set, it indicates to do a conversion from fp16.
- // If the sources's op_sel is set, it picks the high half of the source
- // register.
+ if ((ModsTmp & SISrcMods::NEG) != 0)
+ Mods ^= SISrcMods::NEG;
- Mods |= SISrcMods::OP_SEL_1;
- if (isExtractHiElt(Src, Src)) {
- Mods |= SISrcMods::OP_SEL_0;
+ if ((ModsTmp & SISrcMods::ABS) != 0)
+ Mods |= SISrcMods::ABS;
+ }
- // TODO: Should we try to look for neg/abs here?
- }
+ // op_sel/op_sel_hi decide the source type and source.
+ // If the source's op_sel_hi is set, it indicates to do a conversion from
+  // fp16. If the source's op_sel is set, it picks the high half of the source
+ // register.
- // Prevent unnecessary subreg COPY to VGPR_16
- if (Src.getOpcode() == ISD::TRUNCATE &&
- Src.getOperand(0).getValueType() == MVT::i32) {
- Src = Src.getOperand(0);
- }
- return true;
+ Mods |= SISrcMods::OP_SEL_1;
+ if (IsExtractHigh ||
+ (Src.getValueSizeInBits() == 16 && isExtractHiElt(Src, Src))) {
+ Mods |= SISrcMods::OP_SEL_0;
+
+ // TODO: Should we try to look for neg/abs here?
}
- return false;
+ // Prevent unnecessary subreg COPY to VGPR_16
+ if (Src.getOpcode() == ISD::TRUNCATE &&
+ Src.getOperand(0).getValueType() == MVT::i32) {
+ Src = Src.getOperand(0);
+ }
+ return true;
}
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
unsigned Mods = 0;
- if (!SelectVOP3PMadMixModsImpl(In, Src, Mods))
+ if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16))
return false;
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
return true;
@@ -3921,7 +3993,24 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
unsigned Mods = 0;
- SelectVOP3PMadMixModsImpl(In, Src, Mods);
+ SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16);
+ SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+ return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16ModsExt(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+ unsigned Mods = 0;
+ if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16))
+ return false;
+ SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+ return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+ unsigned Mods = 0;
+ SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16);
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
return true;
}
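The new SelectGlobalSAddrCPol selector keeps the cache policy the intrinsic passed in, but lets the addressing-mode match own the SCAL (scaled offset) bit. A minimal sketch of that bit surgery, with placeholder bit values (GLC/SCAL constants and the helper name rebuildCPol are assumptions for illustration; the real encodings live in AMDGPU::CPol in SIDefines.h):

#include <cstdint>

// Placeholder encodings, for illustration only.
constexpr uint32_t GLC  = 1u << 0;
constexpr uint32_t SCAL = 1u << 11;

// Strip SCAL from whatever the intrinsic supplied, then re-add it only if the
// address pattern decided to scale the register offset.
uint32_t rebuildCPol(uint32_t PassedCPol, bool ScaleOffset) {
  uint32_t Kept = PassedCPol & ~SCAL;
  return (ScaleOffset ? SCAL : 0) | Kept;
}
// rebuildCPol(GLC | SCAL, /*ScaleOffset=*/false) == GLC
// rebuildCPol(GLC,        /*ScaleOffset=*/true)  == GLC | SCAL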
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 6123d75..5636d89 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -19,6 +19,7 @@
#include "SIModeRegisterDefaults.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
#include "llvm/Target/TargetMachine.h"
namespace llvm {
@@ -167,6 +168,9 @@ private:
bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &VOffset, SDValue &Offset,
SDValue &CPol) const;
+ bool SelectGlobalSAddrCPol(SDNode *N, SDValue Addr, SDValue &SAddr,
+ SDValue &VOffset, SDValue &Offset,
+ SDValue &CPol) const;
bool SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &VOffset, SDValue &Offset,
SDValue &CPol) const;
@@ -254,11 +258,15 @@ private:
bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
- bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
- unsigned &Mods) const;
+ bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods,
+ MVT VT) const;
bool SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
SDValue &SrcMods) const;
bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+ bool SelectVOP3PMadMixBF16ModsExt(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const;
+ bool SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const;
bool SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1, SDValue &Src2,
SDValue &Tbl) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 877c3ac..266dee1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -5774,6 +5774,16 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
}
InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectGlobalSAddrCPol(MachineOperand &Root) const {
+ const MachineInstr &I = *Root.getParent();
+
+ // We are assuming CPol is always the last operand of the intrinsic.
+ auto PassedCPol =
+ I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
+ return selectGlobalSAddr(Root, PassedCPol);
+}
+
+InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const {
return selectGlobalSAddr(Root, AMDGPU::CPol::GLC);
}
@@ -7068,6 +7078,17 @@ void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
}
+void AMDGPUInstructionSelector::renderPrefetchLoc(MachineInstrBuilder &MIB,
+ const MachineInstr &MI,
+ int OpIdx) const {
+ uint32_t V = MI.getOperand(2).getImm();
+ V = (AMDGPU::CPol::SCOPE_MASK - (V & AMDGPU::CPol::SCOPE_MASK))
+ << AMDGPU::CPol::SCOPE_SHIFT;
+ if (!Subtarget->hasSafeCUPrefetch())
+ V = std::max(V, (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe
+ MIB.addImm(V);
+}
+
/// Convert from 2-bit value to enum values used for op_sel* source modifiers.
void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
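renderPrefetchLoc above (and the matching PrefetchLoc SDNodeXForm later in FLATInstructions.td) inverts the IR prefetch locality argument into a cache scope: higher locality means a closer scope, and CU scope is bumped to SE when CU prefetch is not safe. A hedged sketch of the arithmetic, with assumed field values (SCOPE_SHIFT/SCOPE_MASK/SCOPE_SE here are placeholders for the AMDGPU::CPol definitions):

#include <algorithm>
#include <cstdint>

constexpr uint32_t SCOPE_SHIFT = 3;              // assumed
constexpr uint32_t SCOPE_MASK  = 0x3;            // scope value mask, pre-shift
constexpr uint32_t SCOPE_SE    = 1u << SCOPE_SHIFT;

// Locality 3 (keep closest) maps to the nearest scope, locality 0 to the
// farthest; the result is clamped away from CU scope when it is unsafe.
uint32_t prefetchLocToScope(uint32_t Loc, bool HasSafeCUPrefetch) {
  uint32_t V = (SCOPE_MASK - (Loc & SCOPE_MASK)) << SCOPE_SHIFT;
  if (!HasSafeCUPrefetch)
    V = std::max(V, SCOPE_SE); // CU scope is unsafe
  return V;
}
// Loc = 3 -> 0 (CU), or SE if CU is unsafe; Loc = 0 -> 3 << SCOPE_SHIFT (SYS).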
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 5f7f05c..fe9743d0a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -261,6 +261,8 @@ private:
InstructionSelector::ComplexRendererFns
selectGlobalSAddr(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
+ selectGlobalSAddrCPol(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
selectGlobalSAddrGLC(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
@@ -414,6 +416,10 @@ private:
void renderRoundMode(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx) const;
+
+ void renderPrefetchLoc(MachineInstrBuilder &MIB, const MachineInstr &MI,
+ int OpIdx) const;
+
void renderScaledMAIIntrinsicOperand(MachineInstrBuilder &MIB,
const MachineInstr &MI, int OpIdx) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index e7bf88d..fedfa3f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4208,6 +4208,9 @@ bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
assert(Ty.isScalar());
unsigned Size = Ty.getSizeInBits();
+ if (ST.hasVectorMulU64() && Size == 64)
+ return true;
+
unsigned NumParts = Size / 32;
assert((Size % 32) == 0);
assert(NumParts >= 2);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
index ba66134..e187959 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
@@ -23,6 +23,8 @@
#include "GCNSubtarget.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
@@ -115,126 +117,233 @@ public:
VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {};
- bool isLaneMask(Register Reg) {
- const RegisterBank *RB = MRI.getRegBankOrNull(Reg);
- if (RB && RB->getID() == AMDGPU::VCCRegBankID)
- return true;
+ bool isLaneMask(Register Reg);
+ std::pair<MachineInstr *, Register> tryMatch(Register Src, unsigned Opcode);
+ std::pair<GUnmerge *, int> tryMatchRALFromUnmerge(Register Src);
+ Register getReadAnyLaneSrc(Register Src);
+ void replaceRegWithOrBuildCopy(Register Dst, Register Src);
- const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
- return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1);
- }
+ bool tryEliminateReadAnyLane(MachineInstr &Copy);
+ void tryCombineCopy(MachineInstr &MI);
+ void tryCombineS1AnyExt(MachineInstr &MI);
+};
- void cleanUpAfterCombine(MachineInstr &MI, MachineInstr *Optional0) {
- MI.eraseFromParent();
- if (Optional0 && isTriviallyDead(*Optional0, MRI))
- Optional0->eraseFromParent();
- }
+bool AMDGPURegBankLegalizeCombiner::isLaneMask(Register Reg) {
+ const RegisterBank *RB = MRI.getRegBankOrNull(Reg);
+ if (RB && RB->getID() == AMDGPU::VCCRegBankID)
+ return true;
- std::pair<MachineInstr *, Register> tryMatch(Register Src, unsigned Opcode) {
- MachineInstr *MatchMI = MRI.getVRegDef(Src);
- if (MatchMI->getOpcode() != Opcode)
- return {nullptr, Register()};
- return {MatchMI, MatchMI->getOperand(1).getReg()};
- }
+ const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
+ return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1);
+}
- void tryCombineCopy(MachineInstr &MI) {
- Register Dst = MI.getOperand(0).getReg();
- Register Src = MI.getOperand(1).getReg();
- // Skip copies of physical registers.
- if (!Dst.isVirtual() || !Src.isVirtual())
- return;
-
- // This is a cross bank copy, sgpr S1 to lane mask.
- //
- // %Src:sgpr(s1) = G_TRUNC %TruncS32Src:sgpr(s32)
- // %Dst:lane-mask(s1) = COPY %Src:sgpr(s1)
- // ->
- // %Dst:lane-mask(s1) = G_AMDGPU_COPY_VCC_SCC %TruncS32Src:sgpr(s32)
- if (isLaneMask(Dst) && MRI.getRegBankOrNull(Src) == SgprRB) {
- auto [Trunc, TruncS32Src] = tryMatch(Src, AMDGPU::G_TRUNC);
- assert(Trunc && MRI.getType(TruncS32Src) == S32 &&
- "sgpr S1 must be result of G_TRUNC of sgpr S32");
-
- B.setInstr(MI);
- // Ensure that truncated bits in BoolSrc are 0.
- auto One = B.buildConstant({SgprRB, S32}, 1);
- auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One);
- B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {Dst}, {BoolSrc});
- cleanUpAfterCombine(MI, Trunc);
- return;
- }
+std::pair<MachineInstr *, Register>
+AMDGPURegBankLegalizeCombiner::tryMatch(Register Src, unsigned Opcode) {
+ MachineInstr *MatchMI = MRI.getVRegDef(Src);
+ if (MatchMI->getOpcode() != Opcode)
+ return {nullptr, Register()};
+ return {MatchMI, MatchMI->getOperand(1).getReg()};
+}
+
+std::pair<GUnmerge *, int>
+AMDGPURegBankLegalizeCombiner::tryMatchRALFromUnmerge(Register Src) {
+ MachineInstr *ReadAnyLane = MRI.getVRegDef(Src);
+ if (ReadAnyLane->getOpcode() != AMDGPU::G_AMDGPU_READANYLANE)
+ return {nullptr, -1};
+
+ Register RALSrc = ReadAnyLane->getOperand(1).getReg();
+ if (auto *UnMerge = getOpcodeDef<GUnmerge>(RALSrc, MRI))
+ return {UnMerge, UnMerge->findRegisterDefOperandIdx(RALSrc, nullptr)};
- // Src = G_AMDGPU_READANYLANE RALSrc
- // Dst = COPY Src
- // ->
- // Dst = RALSrc
- if (MRI.getRegBankOrNull(Dst) == VgprRB &&
- MRI.getRegBankOrNull(Src) == SgprRB) {
- auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE);
- if (!RAL)
- return;
-
- assert(MRI.getRegBank(RALSrc) == VgprRB);
- MRI.replaceRegWith(Dst, RALSrc);
- cleanUpAfterCombine(MI, RAL);
- return;
+ return {nullptr, -1};
+}
+
+Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) {
+ // Src = G_AMDGPU_READANYLANE RALSrc
+ auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE);
+ if (RAL)
+ return RALSrc;
+
+ // LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc
+ // LoSgpr = G_AMDGPU_READANYLANE LoVgpr
+ // HiSgpr = G_AMDGPU_READANYLANE HiVgpr
+  // Src = G_MERGE_VALUES LoSgpr, HiSgpr
+ auto *Merge = getOpcodeDef<GMergeLikeInstr>(Src, MRI);
+ if (Merge) {
+ unsigned NumElts = Merge->getNumSources();
+ auto [Unmerge, Idx] = tryMatchRALFromUnmerge(Merge->getSourceReg(0));
+ if (!Unmerge || Unmerge->getNumDefs() != NumElts || Idx != 0)
+ return {};
+
+    // Check if all elements are from the same unmerge and there is no shuffling.
+ for (unsigned i = 1; i < NumElts; ++i) {
+ auto [UnmergeI, IdxI] = tryMatchRALFromUnmerge(Merge->getSourceReg(i));
+ if (UnmergeI != Unmerge || (unsigned)IdxI != i)
+ return {};
}
+ return Unmerge->getSourceReg();
}
- void tryCombineS1AnyExt(MachineInstr &MI) {
- // %Src:sgpr(S1) = G_TRUNC %TruncSrc
- // %Dst = G_ANYEXT %Src:sgpr(S1)
- // ->
- // %Dst = G_... %TruncSrc
- Register Dst = MI.getOperand(0).getReg();
- Register Src = MI.getOperand(1).getReg();
- if (MRI.getType(Src) != S1)
- return;
-
- auto [Trunc, TruncSrc] = tryMatch(Src, AMDGPU::G_TRUNC);
- if (!Trunc)
- return;
-
- LLT DstTy = MRI.getType(Dst);
- LLT TruncSrcTy = MRI.getType(TruncSrc);
-
- if (DstTy == TruncSrcTy) {
- MRI.replaceRegWith(Dst, TruncSrc);
- cleanUpAfterCombine(MI, Trunc);
- return;
- }
+ // SrcRegIdx = G_AMDGPU_READANYLANE RALElSrc
+  // SourceReg = G_MERGE_VALUES ..., SrcRegIdx, ...
+ // ..., Src, ... = G_UNMERGE_VALUES SourceReg
+ auto *UnMerge = getOpcodeDef<GUnmerge>(Src, MRI);
+ if (!UnMerge)
+ return {};
+
+ int Idx = UnMerge->findRegisterDefOperandIdx(Src, nullptr);
+ Merge = getOpcodeDef<GMergeLikeInstr>(UnMerge->getSourceReg(), MRI);
+ if (!Merge || UnMerge->getNumDefs() != Merge->getNumSources())
+ return {};
+
+ Register SrcRegIdx = Merge->getSourceReg(Idx);
+ if (MRI.getType(Src) != MRI.getType(SrcRegIdx))
+ return {};
+
+ auto [RALEl, RALElSrc] = tryMatch(SrcRegIdx, AMDGPU::G_AMDGPU_READANYLANE);
+ if (RALEl)
+ return RALElSrc;
+
+ return {};
+}
+
+void AMDGPURegBankLegalizeCombiner::replaceRegWithOrBuildCopy(Register Dst,
+ Register Src) {
+ if (Dst.isVirtual())
+ MRI.replaceRegWith(Dst, Src);
+ else
+ B.buildCopy(Dst, Src);
+}
+
+bool AMDGPURegBankLegalizeCombiner::tryEliminateReadAnyLane(
+ MachineInstr &Copy) {
+ Register Dst = Copy.getOperand(0).getReg();
+ Register Src = Copy.getOperand(1).getReg();
+
+ // Skip non-vgpr Dst
+ if (Dst.isVirtual() ? (MRI.getRegBankOrNull(Dst) != VgprRB)
+ : !TRI.isVGPR(MRI, Dst))
+ return false;
+
+ // Skip physical source registers and source registers with register class
+ if (!Src.isVirtual() || MRI.getRegClassOrNull(Src))
+ return false;
+
+ Register RALDst = Src;
+ MachineInstr &SrcMI = *MRI.getVRegDef(Src);
+ if (SrcMI.getOpcode() == AMDGPU::G_BITCAST)
+ RALDst = SrcMI.getOperand(1).getReg();
+
+ Register RALSrc = getReadAnyLaneSrc(RALDst);
+ if (!RALSrc)
+ return false;
+
+ B.setInstr(Copy);
+ if (SrcMI.getOpcode() != AMDGPU::G_BITCAST) {
+ // Src = READANYLANE RALSrc Src = READANYLANE RALSrc
+ // Dst = Copy Src $Dst = Copy Src
+ // -> ->
+ // Dst = RALSrc $Dst = Copy RALSrc
+ replaceRegWithOrBuildCopy(Dst, RALSrc);
+ } else {
+ // RALDst = READANYLANE RALSrc RALDst = READANYLANE RALSrc
+ // Src = G_BITCAST RALDst Src = G_BITCAST RALDst
+ // Dst = Copy Src Dst = Copy Src
+ // -> ->
+ // NewVgpr = G_BITCAST RALDst NewVgpr = G_BITCAST RALDst
+ // Dst = NewVgpr $Dst = Copy NewVgpr
+ auto Bitcast = B.buildBitcast({VgprRB, MRI.getType(Src)}, RALSrc);
+ replaceRegWithOrBuildCopy(Dst, Bitcast.getReg(0));
+ }
+
+ eraseInstr(Copy, MRI);
+ return true;
+}
+
+void AMDGPURegBankLegalizeCombiner::tryCombineCopy(MachineInstr &MI) {
+ if (tryEliminateReadAnyLane(MI))
+ return;
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ // Skip copies of physical registers.
+ if (!Dst.isVirtual() || !Src.isVirtual())
+ return;
+
+ // This is a cross bank copy, sgpr S1 to lane mask.
+ //
+ // %Src:sgpr(s1) = G_TRUNC %TruncS32Src:sgpr(s32)
+ // %Dst:lane-mask(s1) = COPY %Src:sgpr(s1)
+ // ->
+ // %BoolSrc:sgpr(s32) = G_AND %TruncS32Src:sgpr(s32), 1
+ // %Dst:lane-mask(s1) = G_AMDGPU_COPY_VCC_SCC %BoolSrc:sgpr(s32)
+ if (isLaneMask(Dst) && MRI.getRegBankOrNull(Src) == SgprRB) {
+ auto [Trunc, TruncS32Src] = tryMatch(Src, AMDGPU::G_TRUNC);
+ assert(Trunc && MRI.getType(TruncS32Src) == S32 &&
+ "sgpr S1 must be result of G_TRUNC of sgpr S32");
B.setInstr(MI);
+ // Ensure that truncated bits in BoolSrc are 0.
+ auto One = B.buildConstant({SgprRB, S32}, 1);
+ auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One);
+ B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {Dst}, {BoolSrc});
+ eraseInstr(MI, MRI);
+ }
+}
- if (DstTy == S32 && TruncSrcTy == S64) {
- auto Unmerge = B.buildUnmerge({SgprRB, S32}, TruncSrc);
- MRI.replaceRegWith(Dst, Unmerge.getReg(0));
- cleanUpAfterCombine(MI, Trunc);
- return;
- }
+void AMDGPURegBankLegalizeCombiner::tryCombineS1AnyExt(MachineInstr &MI) {
+ // %Src:sgpr(S1) = G_TRUNC %TruncSrc
+ // %Dst = G_ANYEXT %Src:sgpr(S1)
+ // ->
+ // %Dst = G_... %TruncSrc
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ if (MRI.getType(Src) != S1)
+ return;
+
+ auto [Trunc, TruncSrc] = tryMatch(Src, AMDGPU::G_TRUNC);
+ if (!Trunc)
+ return;
+
+ LLT DstTy = MRI.getType(Dst);
+ LLT TruncSrcTy = MRI.getType(TruncSrc);
+
+ if (DstTy == TruncSrcTy) {
+ MRI.replaceRegWith(Dst, TruncSrc);
+ eraseInstr(MI, MRI);
+ return;
+ }
- if (DstTy == S64 && TruncSrcTy == S32) {
- B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
- {TruncSrc, B.buildUndef({SgprRB, S32})});
- cleanUpAfterCombine(MI, Trunc);
- return;
- }
+ B.setInstr(MI);
- if (DstTy == S32 && TruncSrcTy == S16) {
- B.buildAnyExt(Dst, TruncSrc);
- cleanUpAfterCombine(MI, Trunc);
- return;
- }
+ if (DstTy == S32 && TruncSrcTy == S64) {
+ auto Unmerge = B.buildUnmerge({SgprRB, S32}, TruncSrc);
+ MRI.replaceRegWith(Dst, Unmerge.getReg(0));
+ eraseInstr(MI, MRI);
+ return;
+ }
- if (DstTy == S16 && TruncSrcTy == S32) {
- B.buildTrunc(Dst, TruncSrc);
- cleanUpAfterCombine(MI, Trunc);
- return;
- }
+ if (DstTy == S64 && TruncSrcTy == S32) {
+ B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
+ {TruncSrc, B.buildUndef({SgprRB, S32})});
+ eraseInstr(MI, MRI);
+ return;
+ }
- llvm_unreachable("missing anyext + trunc combine");
+ if (DstTy == S32 && TruncSrcTy == S16) {
+ B.buildAnyExt(Dst, TruncSrc);
+ eraseInstr(MI, MRI);
+ return;
}
-};
+
+ if (DstTy == S16 && TruncSrcTy == S32) {
+ B.buildTrunc(Dst, TruncSrc);
+ eraseInstr(MI, MRI);
+ return;
+ }
+
+ llvm_unreachable("missing anyext + trunc combine");
+}
// Search through MRI for virtual registers with sgpr register bank and S1 LLT.
[[maybe_unused]] static Register getAnySgprS1(const MachineRegisterInfo &MRI) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index 411159c..f471881 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -33,7 +33,7 @@ RegBankLegalizeHelper::RegBankLegalizeHelper(
MachineIRBuilder &B, const MachineUniformityInfo &MUI,
const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
: ST(B.getMF().getSubtarget<GCNSubtarget>()), B(B), MRI(*B.getMRI()),
- MUI(MUI), RBI(RBI), RBLRules(RBLRules),
+ MUI(MUI), RBI(RBI), RBLRules(RBLRules), IsWave32(ST.isWave32()),
SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}
@@ -56,6 +56,224 @@ void RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) {
lower(MI, Mapping, WaterfallSgprs);
}
+bool RegBankLegalizeHelper::executeInWaterfallLoop(
+ MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range,
+ SmallSet<Register, 4> &SGPROperandRegs) {
+ // Track use registers which have already been expanded with a readfirstlane
+ // sequence. This may have multiple uses if moving a sequence.
+ DenseMap<Register, Register> WaterfalledRegMap;
+
+ MachineBasicBlock &MBB = B.getMBB();
+ MachineFunction &MF = B.getMF();
+
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
+ unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc, ExecReg;
+ if (IsWave32) {
+ MovExecOpc = AMDGPU::S_MOV_B32;
+ MovExecTermOpc = AMDGPU::S_MOV_B32_term;
+ XorTermOpc = AMDGPU::S_XOR_B32_term;
+ AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
+ ExecReg = AMDGPU::EXEC_LO;
+ } else {
+ MovExecOpc = AMDGPU::S_MOV_B64;
+ MovExecTermOpc = AMDGPU::S_MOV_B64_term;
+ XorTermOpc = AMDGPU::S_XOR_B64_term;
+ AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
+ ExecReg = AMDGPU::EXEC;
+ }
+
+#ifndef NDEBUG
+ const int OrigRangeSize = std::distance(Range.begin(), Range.end());
+#endif
+
+ MachineRegisterInfo &MRI = *B.getMRI();
+ Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
+ Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
+
+ // Don't bother using generic instructions/registers for the exec mask.
+ B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);
+
+ Register SavedExec = MRI.createVirtualRegister(WaveRC);
+
+ // To insert the loop we need to split the block. Move everything before
+ // this point to a new block, and insert a new empty block before this
+ // instruction.
+ MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
+ MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
+ MachineBasicBlock *RestoreExecBB = MF.CreateMachineBasicBlock();
+ MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
+ MachineFunction::iterator MBBI(MBB);
+ ++MBBI;
+ MF.insert(MBBI, LoopBB);
+ MF.insert(MBBI, BodyBB);
+ MF.insert(MBBI, RestoreExecBB);
+ MF.insert(MBBI, RemainderBB);
+
+ LoopBB->addSuccessor(BodyBB);
+ BodyBB->addSuccessor(RestoreExecBB);
+ BodyBB->addSuccessor(LoopBB);
+
+ // Move the rest of the block into a new block.
+ RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
+ RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
+
+ MBB.addSuccessor(LoopBB);
+ RestoreExecBB->addSuccessor(RemainderBB);
+
+ B.setInsertPt(*LoopBB, LoopBB->end());
+
+ // +-MBB:------------+
+ // | ... |
+ // | %0 = G_INST_1 |
+ // | %Dst = MI %Vgpr |
+ // | %1 = G_INST_2 |
+ // | ... |
+ // +-----------------+
+ // ->
+ // +-MBB-------------------------------+
+ // | ... |
+ // | %0 = G_INST_1 |
+ // | %SaveExecReg = S_MOV_B32 $exec_lo |
+ // +----------------|------------------+
+ // | /------------------------------|
+ // V V |
+ // +-LoopBB---------------------------------------------------------------+ |
+ // | %CurrentLaneReg:sgpr(s32) = READFIRSTLANE %Vgpr | |
+ // | instead of executing for each lane, see if other lanes had | |
+ // | same value for %Vgpr and execute for them also. | |
+ // | %CondReg:vcc(s1) = G_ICMP eq %CurrentLaneReg, %Vgpr | |
+ // | %CondRegLM:sreg_32 = ballot %CondReg // copy vcc to sreg32 lane mask | |
+ // | %SavedExec = S_AND_SAVEEXEC_B32 %CondRegLM | |
+ // | exec is active for lanes with the same "CurrentLane value" in Vgpr | |
+ // +----------------|-----------------------------------------------------+ |
+ // V |
+ // +-BodyBB------------------------------------------------------------+ |
+ // | %Dst = MI %CurrentLaneReg:sgpr(s32) | |
+ // | executed only for active lanes and written to Dst | |
+ // | $exec = S_XOR_B32 $exec, %SavedExec | |
+ // | set active lanes to 0 in SavedExec, lanes that did not write to | |
+ // | Dst yet, and set this as new exec (for READFIRSTLANE and ICMP) | |
+ // | SI_WATERFALL_LOOP LoopBB |-----|
+ // +----------------|--------------------------------------------------+
+ // V
+ // +-RestoreExecBB--------------------------+
+ // | $exec_lo = S_MOV_B32_term %SaveExecReg |
+ // +----------------|-----------------------+
+ // V
+ // +-RemainderBB:----------------------+
+ // | %1 = G_INST_2 |
+ // | ... |
+  // +-----------------------------------+
+
+ // Move the instruction into the loop body. Note we moved everything after
+ // Range.end() already into a new block, so Range.end() is no longer valid.
+ BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());
+
+ // Figure out the iterator range after splicing the instructions.
+ MachineBasicBlock::iterator NewBegin = Range.begin()->getIterator();
+ auto NewEnd = BodyBB->end();
+ assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
+
+ B.setMBB(*LoopBB);
+ Register CondReg;
+
+ for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
+ for (MachineOperand &Op : MI.all_uses()) {
+ Register OldReg = Op.getReg();
+ if (!SGPROperandRegs.count(OldReg))
+ continue;
+
+ // See if we already processed this register in another instruction in
+ // the sequence.
+ auto OldVal = WaterfalledRegMap.find(OldReg);
+ if (OldVal != WaterfalledRegMap.end()) {
+ Op.setReg(OldVal->second);
+ continue;
+ }
+
+ Register OpReg = Op.getReg();
+ LLT OpTy = MRI.getType(OpReg);
+
+ // TODO: support for agpr
+ assert(MRI.getRegBank(OpReg) == VgprRB);
+ Register CurrentLaneReg = MRI.createVirtualRegister({SgprRB, OpTy});
+ buildReadFirstLane(B, CurrentLaneReg, OpReg, RBI);
+
+ // Build the comparison(s), CurrentLaneReg == OpReg.
+ unsigned OpSize = OpTy.getSizeInBits();
+ unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
+ LLT PartTy = LLT::scalar(PartSize);
+ unsigned NumParts = OpSize / PartSize;
+ SmallVector<Register, 8> OpParts;
+ SmallVector<Register, 8> CurrentLaneParts;
+
+ if (NumParts == 1) {
+ OpParts.push_back(OpReg);
+ CurrentLaneParts.push_back(CurrentLaneReg);
+ } else {
+ auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
+ auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
+ for (unsigned i = 0; i < NumParts; ++i) {
+ OpParts.push_back(UnmergeOp.getReg(i));
+ CurrentLaneParts.push_back(UnmergeCurrLane.getReg(i));
+ }
+ }
+
+ for (unsigned i = 0; i < NumParts; ++i) {
+ Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
+ B.buildICmp(CmpInst::ICMP_EQ, CmpReg, CurrentLaneParts[i], OpParts[i]);
+
+ if (!CondReg)
+ CondReg = CmpReg;
+ else
+ CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
+ }
+
+ Op.setReg(CurrentLaneReg);
+
+ // Make sure we don't re-process this register again.
+ WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
+ }
+ }
+
+ // Copy vcc to sgpr32/64, ballot becomes a no-op during instruction selection.
+ Register CondRegLM =
+ MRI.createVirtualRegister({WaveRC, LLT::scalar(IsWave32 ? 32 : 64)});
+ B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);
+
+ // Update EXEC, save the original EXEC value to SavedExec.
+ B.buildInstr(AndSaveExecOpc)
+ .addDef(SavedExec)
+ .addReg(CondRegLM, RegState::Kill);
+ MRI.setSimpleHint(SavedExec, CondRegLM);
+
+ B.setInsertPt(*BodyBB, BodyBB->end());
+
+ // Update EXEC, switch all done bits to 0 and all todo bits to 1.
+ B.buildInstr(XorTermOpc).addDef(ExecReg).addReg(ExecReg).addReg(SavedExec);
+
+ // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
+ // s_cbranch_scc0?
+
+ // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
+ B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
+
+ // Save the EXEC mask before the loop.
+ B.setInsertPt(MBB, MBB.end());
+ B.buildInstr(MovExecOpc).addDef(SaveExecReg).addReg(ExecReg);
+
+ // Restore the EXEC mask after the loop.
+ B.setInsertPt(*RestoreExecBB, RestoreExecBB->begin());
+ B.buildInstr(MovExecTermOpc).addDef(ExecReg).addReg(SaveExecReg);
+
+ // Set the insert point after the original instruction, so any new
+ // instructions will be in the remainder.
+ B.setInsertPt(*RemainderBB, RemainderBB->begin());
+
+ return true;
+}
+
void RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
ArrayRef<LLT> LLTBreakdown, LLT MergeTy) {
MachineFunction &MF = B.getMF();
@@ -391,7 +609,7 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
switch (Mapping.LoweringMethod) {
case DoNotLower:
- return;
+ break;
case VccExtToSel:
return lowerVccExtToSel(MI);
case UniExtToSel: {
@@ -527,7 +745,10 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
}
}
- // TODO: executeInWaterfallLoop(... WaterfallSgprs)
+ if (!WaterfallSgprs.empty()) {
+ MachineBasicBlock::iterator I = MI.getIterator();
+ executeInWaterfallLoop(B, make_range(I, std::next(I)), WaterfallSgprs);
+ }
}
LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
@@ -539,6 +760,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
case Vgpr16:
return LLT::scalar(16);
case Sgpr32:
+ case Sgpr32_WF:
case Sgpr32Trunc:
case Sgpr32AExt:
case Sgpr32AExtBoolInReg:
@@ -577,6 +799,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
case VgprV2S32:
return LLT::fixed_vector(2, 32);
case SgprV4S32:
+ case SgprV4S32_WF:
case VgprV4S32:
case UniInVgprV4S32:
return LLT::fixed_vector(4, 32);
@@ -650,6 +873,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
return VccRB;
case Sgpr16:
case Sgpr32:
+ case Sgpr32_WF:
case Sgpr64:
case Sgpr128:
case SgprP1:
@@ -662,6 +886,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
case SgprV2S16:
case SgprV2S32:
case SgprV4S32:
+ case SgprV4S32_WF:
case SgprB32:
case SgprB64:
case SgprB96:
@@ -923,6 +1148,14 @@ void RegBankLegalizeHelper::applyMappingSrc(
}
break;
}
+ // sgpr waterfall, scalars and vectors
+ case Sgpr32_WF:
+ case SgprV4S32_WF: {
+ assert(Ty == getTyFromID(MethodIDs[i]));
+ if (RB != SgprRB)
+ SgprWaterfallOperandRegs.insert(Reg);
+ break;
+ }
// sgpr and vgpr scalars with extend
case Sgpr32AExt: {
// Note: this ext allows S1, and it is meant to be combined away.
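One detail worth calling out in the executeInWaterfallLoop code above: the lane-equality check is built per piece of each SGPR-required operand. A small sketch of just that size arithmetic (the standalone helper comparePieces is illustrative, not part of the patch):

#include <cassert>
#include <utility>

// Returns {PartSize, NumParts}: sizes divisible by 64 are compared in 64-bit
// pieces, everything else in 32-bit pieces; the per-piece G_ICMP results are
// then AND-ed together into the single waterfall condition.
std::pair<unsigned, unsigned> comparePieces(unsigned OpSizeInBits) {
  assert(OpSizeInBits % 32 == 0 && "register pieces are multiples of 32 bits");
  unsigned PartSize = (OpSizeInBits % 64 == 0) ? 64 : 32;
  return {PartSize, OpSizeInBits / PartSize};
}
// comparePieces(32)  -> {32, 1}   one 32-bit compare
// comparePieces(96)  -> {32, 3}   three 32-bit compares AND-ed
// comparePieces(128) -> {64, 2}   two 64-bit compares AND-ed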
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
index 08cc7d4..db965d8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
@@ -32,6 +32,7 @@ class RegBankLegalizeHelper {
const MachineUniformityInfo &MUI;
const RegisterBankInfo &RBI;
const RegBankLegalizeRules &RBLRules;
+ const bool IsWave32;
const RegisterBank *SgprRB;
const RegisterBank *VgprRB;
const RegisterBank *VccRB;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index a60855c..5a6ad40 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -529,7 +529,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
addRulesForGOpcs({G_ICMP})
.Any({{UniS1, _, S32}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}})
- .Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}});
+ .Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}})
+ .Any({{DivS1, _, S64}, {{Vcc}, {None, Vgpr64, Vgpr64}}});
addRulesForGOpcs({G_FCMP})
.Any({{UniS1, _, S32}, {{UniInVcc}, {None, Vgpr32, Vgpr32}}})
@@ -666,11 +667,15 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
// clang-format off
addRulesForGOpcs({G_LOAD})
.Any({{DivB32, DivP0}, {{VgprB32}, {VgprP0}}})
+ .Any({{DivB32, UniP0}, {{VgprB32}, {VgprP0}}})
.Any({{DivB32, DivP1}, {{VgprB32}, {VgprP1}}})
.Any({{{UniB256, UniP1}, isAlign4 && isUL}, {{SgprB256}, {SgprP1}}})
.Any({{{UniB512, UniP1}, isAlign4 && isUL}, {{SgprB512}, {SgprP1}}})
.Any({{{UniB32, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB32}, {SgprP1}}})
+ .Any({{{UniB64, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB64}, {SgprP1}}})
+ .Any({{{UniB96, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB96}, {SgprP1}}})
+ .Any({{{UniB128, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB128}, {SgprP1}}})
.Any({{{UniB256, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB256}, {VgprP1}, SplitLoad}})
.Any({{{UniB512, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB512}, {VgprP1}, SplitLoad}})
@@ -684,6 +689,7 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Any({{{UniB96, UniP4}, isAlign16 && isUL}, {{SgprB96}, {SgprP4}, WidenLoad}}, !hasUnalignedLoads)
.Any({{{UniB96, UniP4}, isAlign4 && !isAlign16 && isUL}, {{SgprB96}, {SgprP4}, SplitLoad}}, !hasUnalignedLoads)
.Any({{{UniB96, UniP4}, isAlign4 && isUL}, {{SgprB96}, {SgprP4}}}, hasUnalignedLoads)
+ .Any({{{UniB128, UniP4}, isAlign4 && isUL}, {{SgprB128}, {SgprP4}}})
.Any({{{UniB256, UniP4}, isAlign4 && isUL}, {{SgprB256}, {SgprP4}}})
.Any({{{UniB512, UniP4}, isAlign4 && isUL}, {{SgprB512}, {SgprP4}}})
.Any({{{UniB32, UniP4}, !isNaturalAlignedSmall || !isUL}, {{UniInVgprB32}, {VgprP4}}}, hasSMRDSmall) // i8 and i16 load
@@ -698,11 +704,15 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Any({{{UniB32, UniP4}, !isAlign4 || !isUL}, {{UniInVgprB32}, {VgprP4}}});
// clang-format on
- addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD}, Vector)
- .Div(S32, {{Vgpr32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}})
- .Uni(S32, {{UniInVgprS32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}})
- .Div(V4S32, {{VgprV4S32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}})
- .Uni(V4S32, {{UniInVgprV4S32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}});
+ addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD}, StandardB)
+ .Div(B32, {{VgprB32}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Uni(B32, {{UniInVgprB32}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Div(B64, {{VgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Uni(B64, {{UniInVgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Div(B96, {{VgprB96}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Uni(B96, {{UniInVgprB96}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Div(B128, {{VgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Uni(B128, {{UniInVgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}});
addRulesForGOpcs({G_STORE})
.Any({{S32, P0}, {{}, {Vgpr32, VgprP0}}})
@@ -716,7 +726,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
addRulesForGOpcs({G_PTR_ADD})
.Any({{UniP1}, {{SgprP1}, {SgprP1, Sgpr64}}})
.Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}})
- .Any({{DivP0}, {{VgprP0}, {VgprP0, Vgpr64}}});
+ .Any({{DivP0}, {{VgprP0}, {VgprP0, Vgpr64}}})
+ .Any({{UniP4}, {{SgprP4}, {SgprP4, Sgpr64}}});
addRulesForGOpcs({G_INTTOPTR})
.Any({{UniPtr32}, {{SgprPtr32}, {Sgpr32}}})
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
index 7243d75..1391440 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
@@ -188,7 +188,11 @@ enum RegBankLLTMappingApplyID {
Sgpr32Trunc,
- // Src only modifiers: waterfalls, extends
+ // Src only modifiers: execute in waterfall loop if divergent
+ Sgpr32_WF,
+ SgprV4S32_WF,
+
+ // Src only modifiers: extends
Sgpr32AExt,
Sgpr32AExtBoolInReg,
Sgpr32SExt,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index f1caf24..c5a1d9e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2528,7 +2528,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
// Special case for s_mul_u64. There is not a vector equivalent of
// s_mul_u64. Hence, we have to break down s_mul_u64 into 32-bit vector
// multiplications.
- if (Opc == AMDGPU::G_MUL && DstTy.getSizeInBits() == 64) {
+ if (!Subtarget.hasVectorMulU64() && Opc == AMDGPU::G_MUL &&
+ DstTy.getSizeInBits() == 64) {
applyMappingSMULU64(B, OpdMapper);
return;
}
@@ -3500,19 +3501,24 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
applyMappingMAD_64_32(B, OpdMapper);
return;
case AMDGPU::G_PREFETCH: {
- if (!Subtarget.hasPrefetch() || !Subtarget.hasSafeSmemPrefetch()) {
+ if (!Subtarget.hasSafeSmemPrefetch() && !Subtarget.hasVmemPrefInsts()) {
MI.eraseFromParent();
return;
}
Register PtrReg = MI.getOperand(0).getReg();
unsigned PtrBank = getRegBankID(PtrReg, MRI, AMDGPU::SGPRRegBankID);
- if (PtrBank == AMDGPU::VGPRRegBankID) {
+ if (PtrBank == AMDGPU::VGPRRegBankID &&
+ (!Subtarget.hasVmemPrefInsts() || !MI.getOperand(3).getImm())) {
+ // Cannot do I$ prefetch with divergent pointer.
MI.eraseFromParent();
return;
}
unsigned AS = MRI.getType(PtrReg).getAddressSpace();
- if (!AMDGPU::isFlatGlobalAddrSpace(AS) &&
- AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
+ if ((!AMDGPU::isFlatGlobalAddrSpace(AS) &&
+ AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
+ (!Subtarget.hasSafeSmemPrefetch() &&
+ (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
+ !MI.getOperand(3).getImm() /* I$ prefetch */))) {
MI.eraseFromParent();
return;
}
@@ -3973,7 +3979,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
} else {
- OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
+ if (MI.getOpcode() == AMDGPU::G_MUL && Subtarget.hasVectorMulU64())
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+ else
+ OpdsMapping[0] =
+ getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
@@ -5170,6 +5180,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_ds_load_tr16_b128:
case Intrinsic::amdgcn_ds_load_tr4_b64:
case Intrinsic::amdgcn_ds_load_tr6_b96:
+ case Intrinsic::amdgcn_flat_load_monitor_b32:
+ case Intrinsic::amdgcn_flat_load_monitor_b64:
+ case Intrinsic::amdgcn_flat_load_monitor_b128:
+ case Intrinsic::amdgcn_global_load_monitor_b32:
+ case Intrinsic::amdgcn_global_load_monitor_b64:
+ case Intrinsic::amdgcn_global_load_monitor_b128:
case Intrinsic::amdgcn_ds_read_tr4_b64:
case Intrinsic::amdgcn_ds_read_tr6_b96:
case Intrinsic::amdgcn_ds_read_tr8_b64:
@@ -5432,6 +5448,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
break;
}
+ case Intrinsic::amdgcn_flat_prefetch:
+ case Intrinsic::amdgcn_global_prefetch:
+ return getDefaultMappingVOP(MI);
default:
return getInvalidInstructionMapping();
}
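The G_PREFETCH handling above folds several subtarget and operand checks into nested erase conditions. A hedged restatement as a single predicate may help when reading it (the function keepPrefetch and its boolean parameters are stand-ins for the subtarget queries, the pointer-bank check, and operand 3, the data-vs-instruction prefetch flag):

// Keep the G_PREFETCH only if some prefetch path can actually handle it.
bool keepPrefetch(bool HasSafeSmemPrefetch, bool HasVmemPrefInsts,
                  bool DivergentPtr, bool IsDataPrefetch, bool FlatOrGlobalAS,
                  bool Const32BitAS) {
  if (!HasSafeSmemPrefetch && !HasVmemPrefInsts)
    return false;                          // no usable prefetch instruction
  if (DivergentPtr && (!HasVmemPrefInsts || !IsDataPrefetch))
    return false;                          // no I$ prefetch via a VGPR pointer
  if (!FlatOrGlobalAS && !Const32BitAS)
    return false;                          // unsupported address space
  if (!HasSafeSmemPrefetch && (Const32BitAS || !IsDataPrefetch))
    return false;                          // SMEM prefetch path is not safe
  return true;
}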
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
index a8e1967..f580f43 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
@@ -159,7 +159,8 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
// If the inputs are tied and the same register, we can shortcut and
// directly replace the register.
- if (Src2->getReg() != CopySrcReg) {
+ if (!Src2->isReg() || Src2->getReg() != CopySrcReg ||
+ Src2->getSubReg() != DefMI->getOperand(1).getSubReg()) {
LLVM_DEBUG(
dbgs()
<< "Replacing untied VGPR MFMAs with AGPR form not yet handled\n");
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 1e44be8..6878744 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -61,6 +61,7 @@ protected:
bool EnableRealTrue16Insts = false;
bool HasBF16TransInsts = false;
bool HasBF16ConversionInsts = false;
+ bool HasBF16PackedInsts = false;
bool HasMadMixInsts = false;
bool HasMadMacF32Insts = false;
bool HasDsSrc2Insts = false;
@@ -209,6 +210,8 @@ public:
return HasBF16ConversionInsts;
}
+ bool hasBF16PackedInsts() const { return HasBF16PackedInsts; }
+
bool hasMadMixInsts() const {
return HasMadMixInsts;
}
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 679c55d..7207c25 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -13,6 +13,7 @@ let WantsRoot = true in {
def GlobalSAddr : ComplexPattern<iPTR, 4, "SelectGlobalSAddr", [], [], -10>;
def GlobalSAddrGLC : ComplexPattern<iPTR, 4, "SelectGlobalSAddrGLC", [], [], -10>;
+ def GlobalSAddrCPol : ComplexPattern<iPTR, 4, "SelectGlobalSAddrCPol", [], [], -10>;
def ScratchSAddr : ComplexPattern<iPTR, 2, "SelectScratchSAddr", [], [], -10>;
def ScratchSVAddr : ComplexPattern<iPTR, 4, "SelectScratchSVAddr", [], [], -10>;
}
@@ -464,6 +465,37 @@ class FLAT_Global_Invalidate_Writeback<string opName, SDPatternOperator node = n
let sve = 0;
}
+class FLAT_Prefetch_Pseudo<string opName, dag addr = (ins VReg_64:$vaddr), string asm = " $vaddr"> :
+ FLAT_Pseudo<opName, (outs), !con(addr, (ins flat_offset:$offset, CPol_0:$cpol)), asm#"$offset$cpol"> {
+ let has_vdst = 0;
+ let has_data = 0;
+ let mayLoad = 1;
+ let mayStore = 1;
+ let VM_CNT = 0;
+ let LGKM_CNT = 0;
+}
+
+multiclass FLAT_Flat_Prefetch_Pseudo<string opName> {
+ def "" : FLAT_Prefetch_Pseudo<opName>,
+ GlobalSaddrTable<0, opName>;
+ def _SADDR : FLAT_Prefetch_Pseudo<opName, (ins SReg_64:$saddr, VGPR_32:$vaddr), " $vaddr, $saddr">,
+ GlobalSaddrTable<1, opName> {
+ let OtherPredicates = [HasFlatGVSMode];
+ let enabled_saddr = 1;
+ }
+}
+
+multiclass FLAT_Global_Prefetch_Pseudo<string opName> {
+ let is_flat_global = 1, has_saddr = 1 in {
+ def "" : FLAT_Prefetch_Pseudo<opName, (ins VReg_64:$vaddr), " $vaddr, off">,
+ GlobalSaddrTable<0, opName>;
+ def _SADDR : FLAT_Prefetch_Pseudo<opName, (ins SReg_64:$saddr, VGPR_32:$vaddr), " $vaddr, $saddr">,
+ GlobalSaddrTable<1, opName> {
+ let enabled_saddr = 1;
+ }
+ }
+}
+
class FlatScratchInst <string sv_op, string mode> {
string SVOp = sv_op;
string Mode = mode;
@@ -1162,6 +1194,16 @@ defm SCRATCH_LOAD_LDS_USHORT : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_u
defm SCRATCH_LOAD_LDS_SSHORT : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_sshort">;
defm SCRATCH_LOAD_LDS_DWORD : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_dword">;
+let SubtargetPredicate = isGFX125xOnly in {
+defm FLAT_LOAD_MONITOR_B32 : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b32", VGPR_32>;
+defm FLAT_LOAD_MONITOR_B64 : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b64", VReg_64>;
+defm FLAT_LOAD_MONITOR_B128 : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b128", VReg_128>;
+
+defm GLOBAL_LOAD_MONITOR_B32 : FLAT_Global_Load_Pseudo <"global_load_monitor_b32", VGPR_32>;
+defm GLOBAL_LOAD_MONITOR_B64 : FLAT_Global_Load_Pseudo <"global_load_monitor_b64", VReg_64>;
+defm GLOBAL_LOAD_MONITOR_B128 : FLAT_Global_Load_Pseudo <"global_load_monitor_b128", VReg_128>;
+} // End SubtargetPredicate = isGFX125xOnly
+
let SubtargetPredicate = isGFX12Plus in {
let Uses = [EXEC, M0] in {
defm GLOBAL_LOAD_BLOCK : FLAT_Global_Load_Pseudo <"global_load_block", VReg_1024>;
@@ -1218,6 +1260,11 @@ let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in
"global_atomic_pk_add_f16", VGPR_32, v2f16
>;
+let SubtargetPredicate = HasVmemPrefInsts in {
+ defm FLAT_PREFETCH_B8 : FLAT_Flat_Prefetch_Pseudo<"flat_prefetch_b8">;
+ defm GLOBAL_PREFETCH_B8 : FLAT_Global_Prefetch_Pseudo<"global_prefetch_b8">;
+}
+
//===----------------------------------------------------------------------===//
// Flat Patterns
//===----------------------------------------------------------------------===//
@@ -1228,6 +1275,11 @@ class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCN
(inst $vaddr, $offset)
>;
+class FlatLoadPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (FlatOffset i64:$vaddr, i32:$offset), (i32 timm:$cpol))),
+ (inst $vaddr, $offset, $cpol)
+>;
+
class FlatLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(node (FlatOffset (i64 VReg_64:$vaddr), i32:$offset), vt:$in),
(inst $vaddr, $offset, 0, $in)
@@ -1249,8 +1301,8 @@ class FlatSignedLoadPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Value
>;
class GlobalLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), vt:$in)),
- (inst $saddr, $voffset, $offset, 0, $in)
+ (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), vt:$in)),
+ (inst $saddr, $voffset, $offset, $cpol, $in)
>;
class FlatLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
@@ -1264,8 +1316,8 @@ class FlatLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueT
>;
class GlobalLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))),
- (inst $saddr, $voffset, $offset, (i32 0))
+ (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))),
+ (inst $saddr, $voffset, $offset, $cpol)
>;
class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
@@ -1278,6 +1330,16 @@ class FlatLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt>
(inst $saddr, $voffset, $offset, $cpol)
>;
+class FlatLoadSignedPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), (i32 timm:$cpol))),
+ (inst $vaddr, $offset, $cpol)
+>;
+
+class GlobalLoadSaddrPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (GlobalSAddrCPol (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), (i32 timm))),
+ (inst $saddr, $voffset, $offset, $cpol)
+>;
+
class FlatStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt> : GCNPat <
(node vt:$data, (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol)),
@@ -1459,8 +1521,8 @@ class ScratchLoadSVaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueT
>;
class ScratchLoadSVaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset))),
- (inst $vaddr, $saddr, $offset, 0)
+ (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol))),
+ (inst $vaddr, $saddr, $offset, $cpol)
>;
multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
@@ -1473,6 +1535,16 @@ multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueTyp
}
}
+multiclass GlobalFLATLoadPats_CPOL<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+ def : FlatLoadSignedPat_CPOL<inst, node, vt> {
+ let AddedComplexity = 10;
+ }
+
+ def : GlobalLoadSaddrPat_CPOL<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ let AddedComplexity = 11;
+ }
+}
+
multiclass GlobalFLATLoadPats_D16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
def : FlatSignedLoadPat_D16 <inst, node, vt> {
let AddedComplexity = 10;
@@ -2009,6 +2081,16 @@ let WaveSizePredicate = isWave32, OtherPredicates = [HasTransposeLoadF4F6Insts]
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR6_B96, int_amdgcn_global_load_tr6_b96, v3i32>;
}
+let OtherPredicates = [isGFX125xOnly] in {
+ def : FlatLoadPat_CPOL <FLAT_LOAD_MONITOR_B32, int_amdgcn_flat_load_monitor_b32, i32>;
+ def : FlatLoadPat_CPOL <FLAT_LOAD_MONITOR_B64, int_amdgcn_flat_load_monitor_b64, v2i32>;
+ def : FlatLoadPat_CPOL <FLAT_LOAD_MONITOR_B128, int_amdgcn_flat_load_monitor_b128, v4i32>;
+
+ defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B32, int_amdgcn_global_load_monitor_b32, i32>;
+ defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B64, int_amdgcn_global_load_monitor_b64, v2i32>;
+ defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B128, int_amdgcn_global_load_monitor_b128, v4i32>;
+} // End SubtargetPredicate = isGFX125xOnly
+
let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in {
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>;
@@ -2138,6 +2220,77 @@ defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16, load_d16_lo_private, v2f
} // End OtherPredicates = [HasFlatScratchInsts,EnableFlatScratch]
+def PrefetchLoc: SDNodeXForm<timm, [{
+ uint32_t V = N->getZExtValue();
+ V = (AMDGPU::CPol::SCOPE_MASK - (V & AMDGPU::CPol::SCOPE_MASK)) << AMDGPU::CPol::SCOPE_SHIFT;
+ if (!Subtarget->hasSafeCUPrefetch())
+ V = std::max(V, (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe
+ return CurDAG->getTargetConstant(V, SDLoc(N), MVT::i32);
+}]>;
+
+def prefetch_flat : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type),
+ (prefetch node:$ptr, node:$rw, node:$loc, node:$type),
+ [{ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS; }]> {
+ let GISelPredicateCode = [{
+ return (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
+ }];
+}
+
+def prefetch_global : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type),
+ (prefetch node:$ptr, node:$rw, node:$loc, node:$type),
+ [{ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+ (cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+ !Subtarget->hasSafeSmemPrefetch()); }]> {
+ let GISelPredicateCode = [{
+ return (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+ ((*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+ !Subtarget->hasSafeSmemPrefetch());
+ }];
+}
+
+multiclass FlatPrefetchPats<string inst, SDPatternOperator prefetch_kind, SDPatternOperator rw> {
+ def : GCNPat <
+ (prefetch_kind (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), rw, (i32 timm:$loc), i32imm_one),
+ (!cast<FLAT_Pseudo>(inst) $vaddr, $offset, (i32 (PrefetchLoc $loc)))
+ > {
+ let AddedComplexity = !if(!eq(rw, i32imm_zero), 0, 25);
+ }
+
+ def : GCNPat <
+ (prefetch_kind (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), rw, (i32 timm:$loc), i32imm_one),
+ (!cast<FLAT_Pseudo>(inst#"_SADDR") $saddr, $voffset, $offset, (i32 (PrefetchLoc $loc)))
+ > {
+ let AddedComplexity = !if(!eq(rw, i32imm_zero), 11, 30);
+ }
+}
+
+multiclass FlatIntrPrefetchPats<string inst, SDPatternOperator intr> {
+ def : GCNPat <
+ (intr (FlatOffset i64:$vaddr, i32:$offset), timm:$cpol),
+ (!cast<FLAT_Pseudo>(inst) $vaddr, $offset, $cpol)
+ >;
+
+ def : GCNPat <
+ (intr (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), timm:$cpol),
+ (!cast<FLAT_Pseudo>(inst#"_SADDR") $saddr, $voffset, $offset, $cpol)> {
+ let AddedComplexity = 11;
+ }
+}
+
+let SubtargetPredicate = HasVmemPrefInsts in {
+ defm : FlatPrefetchPats<"FLAT_PREFETCH_B8", prefetch_flat, i32imm_zero>;
+ defm : FlatPrefetchPats<"GLOBAL_PREFETCH_B8", prefetch_global, i32imm_zero>;
+
+ // Patterns for forced vector prefetch with rw = 1.
+ defm : FlatPrefetchPats<"FLAT_PREFETCH_B8", prefetch_flat, i32imm_one>;
+ defm : FlatPrefetchPats<"GLOBAL_PREFETCH_B8", prefetch_global, i32imm_one>;
+
+
+ // Patterns for target intrinsics
+ defm : FlatIntrPrefetchPats<"FLAT_PREFETCH_B8", int_amdgcn_flat_prefetch>;
+ defm : FlatIntrPrefetchPats<"GLOBAL_PREFETCH_B8", int_amdgcn_global_prefetch>;
+} // End SubtargetPredicate = HasVmemPrefInsts
+
//===----------------------------------------------------------------------===//
// Target
//===----------------------------------------------------------------------===//
@@ -3210,6 +3363,17 @@ multiclass VFLAT_Real_Atomics_gfx1250<bits<8> op, string name = get_FLAT_ps<NAME
defm TENSOR_SAVE : VFLAT_Real_gfx1250<0x06e>;
defm TENSOR_STOP : VFLAT_Real_gfx1250<0x06f>;
+defm FLAT_PREFETCH_B8 : VFLAT_Real_AllAddr_gfx1250<0x05d>;
+defm GLOBAL_PREFETCH_B8 : VFLAT_Real_AllAddr_gfx1250<0x05d>;
+
+defm FLAT_LOAD_MONITOR_B32 : VFLAT_Real_AllAddr_gfx1250<0x070>;
+defm FLAT_LOAD_MONITOR_B64 : VFLAT_Real_AllAddr_gfx1250<0x071>;
+defm FLAT_LOAD_MONITOR_B128 : VFLAT_Real_AllAddr_gfx1250<0x072>;
+
+defm GLOBAL_LOAD_MONITOR_B32 : VFLAT_Real_AllAddr_gfx1250<0x070>;
+defm GLOBAL_LOAD_MONITOR_B64 : VFLAT_Real_AllAddr_gfx1250<0x071>;
+defm GLOBAL_LOAD_MONITOR_B128 : VFLAT_Real_AllAddr_gfx1250<0x072>;
+
defm GLOBAL_LOAD_TR_B128_w32 : VFLAT_Real_AllAddr_gfx1250<0x057, "global_load_tr16_b128">;
defm GLOBAL_LOAD_TR_B64_w32 : VFLAT_Real_AllAddr_gfx1250<0x058, "global_load_tr8_b64">;
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index 7d6723a..334afd3 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -38,7 +38,11 @@ bool llvm::isEqual(const GCNRPTracker::LiveRegSet &S1,
unsigned GCNRegPressure::getRegKind(const TargetRegisterClass *RC,
const SIRegisterInfo *STI) {
- return STI->isSGPRClass(RC) ? SGPR : (STI->isAGPRClass(RC) ? AGPR : VGPR);
+ return STI->isSGPRClass(RC)
+ ? SGPR
+ : (STI->isAGPRClass(RC)
+ ? AGPR
+ : (STI->isVectorSuperClass(RC) ? AVGPR : VGPR));
}
void GCNRegPressure::inc(unsigned Reg,
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 3749b6d..ea33a22 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -29,43 +29,57 @@ class raw_ostream;
class SlotIndex;
struct GCNRegPressure {
- enum RegKind { SGPR, VGPR, AGPR, TOTAL_KINDS };
+ enum RegKind { SGPR, VGPR, AGPR, AVGPR, TOTAL_KINDS };
GCNRegPressure() {
clear();
}
- bool empty() const { return !Value[SGPR] && !Value[VGPR] && !Value[AGPR]; }
+ bool empty() const {
+ return !Value[SGPR] && !Value[VGPR] && !Value[AGPR] && !Value[AVGPR];
+ }
void clear() { std::fill(&Value[0], &Value[ValueArraySize], 0); }
/// \returns the SGPR32 pressure
unsigned getSGPRNum() const { return Value[SGPR]; }
- /// \returns the aggregated ArchVGPR32, AccVGPR32 pressure dependent upon \p
- /// UnifiedVGPRFile
+ /// \returns the aggregated ArchVGPR32, AccVGPR32, and Pseudo AVGPR pressure
+ /// dependent upon \p UnifiedVGPRFile
unsigned getVGPRNum(bool UnifiedVGPRFile) const {
if (UnifiedVGPRFile) {
- return Value[AGPR] ? getUnifiedVGPRNum(Value[VGPR], Value[AGPR])
- : Value[VGPR];
+ return Value[AGPR]
+ ? getUnifiedVGPRNum(Value[VGPR], Value[AGPR], Value[AVGPR])
+ : Value[VGPR] + Value[AVGPR];
}
- return std::max(Value[VGPR], Value[AGPR]);
+    // AVGPR assignment priority is based on the width of the register. Account
+    // AVGPR pressure as VGPR pressure.
+ return std::max(Value[VGPR] + Value[AVGPR], Value[AGPR]);
}
/// Returns the aggregated VGPR pressure, assuming \p NumArchVGPRs ArchVGPRs
- /// and \p NumAGPRs AGPRS, for a target with a unified VGPR file.
+  /// \p NumAGPRs AGPRs, and \p NumAVGPRs AVGPRs for a target with a unified
+ /// VGPR file.
inline static unsigned getUnifiedVGPRNum(unsigned NumArchVGPRs,
- unsigned NumAGPRs) {
- return alignTo(NumArchVGPRs, AMDGPU::IsaInfo::getArchVGPRAllocGranule()) +
+ unsigned NumAGPRs,
+ unsigned NumAVGPRs) {
+
+ // Assume AVGPRs will be assigned as VGPRs.
+ return alignTo(NumArchVGPRs + NumAVGPRs,
+ AMDGPU::IsaInfo::getArchVGPRAllocGranule()) +
NumAGPRs;
}
- /// \returns the ArchVGPR32 pressure
- unsigned getArchVGPRNum() const { return Value[VGPR]; }
+  /// \returns the ArchVGPR32 pressure, plus the AVGPRs, which we assume will be
+  /// allocated as VGPRs.
+ unsigned getArchVGPRNum() const { return Value[VGPR] + Value[AVGPR]; }
/// \returns the AccVGPR32 pressure
unsigned getAGPRNum() const { return Value[AGPR]; }
+ /// \returns the AVGPR32 pressure
+ unsigned getAVGPRNum() const { return Value[AVGPR]; }
unsigned getVGPRTuplesWeight() const {
- return std::max(Value[TOTAL_KINDS + VGPR], Value[TOTAL_KINDS + AGPR]);
+ return std::max(Value[TOTAL_KINDS + VGPR] + Value[TOTAL_KINDS + AVGPR],
+ Value[TOTAL_KINDS + AGPR]);
}
unsigned getSGPRTuplesWeight() const { return Value[TOTAL_KINDS + SGPR]; }
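A small standalone sketch of the AVGPR accounting introduced above (the allocation granule of 4 is purely illustrative, not the real target value): pseudo AV registers are assumed to be assigned as VGPRs, so they are added to the ArchVGPR count before aligning to the allocation granule in the unified-file case, and to the VGPR side of the max() in the split-file case.

#include <algorithm>

// Round Value up to a multiple of Align.
constexpr unsigned alignUp(unsigned Value, unsigned Align) {
  return (Value + Align - 1) / Align * Align;
}

unsigned unifiedVGPRNum(unsigned NumArchVGPRs, unsigned NumAGPRs,
                        unsigned NumAVGPRs, unsigned Granule = 4) {
  // AVGPRs are counted as ArchVGPRs before the granule alignment.
  return alignUp(NumArchVGPRs + NumAVGPRs, Granule) + NumAGPRs;
}

unsigned vgprNum(bool UnifiedVGPRFile, unsigned VGPR, unsigned AGPR,
                 unsigned AVGPR) {
  if (UnifiedVGPRFile)
    return AGPR ? unifiedVGPRNum(VGPR, AGPR, AVGPR) : VGPR + AVGPR;
  // Separate register files: AVGPRs count against the VGPR budget.
  return std::max(VGPR + AVGPR, AGPR);
}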
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index a655308..ce1ce68 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1911,14 +1911,12 @@ void PreRARematStage::rematerialize() {
for (auto &[DefMI, Remat] : Rematerializations) {
MachineBasicBlock::iterator InsertPos(Remat.UseMI);
Register Reg = DefMI->getOperand(0).getReg();
- unsigned SubReg = DefMI->getOperand(0).getSubReg();
unsigned DefRegion = MIRegion.at(DefMI);
// Rematerialize DefMI to its use block.
- TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg, SubReg, *DefMI,
- *DAG.TRI);
+ TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
+ AMDGPU::NoSubRegister, *DefMI, *DAG.TRI);
Remat.RematMI = &*std::prev(InsertPos);
- Remat.RematMI->getOperand(0).setSubReg(SubReg);
DAG.LIS->InsertMachineInstrInMaps(*Remat.RematMI);
// Update region boundaries in regions we sinked from (remove defining MI)
@@ -2064,14 +2062,13 @@ void PreRARematStage::finalizeGCNSchedStage() {
MachineBasicBlock::iterator InsertPos(DAG.Regions[DefRegion].second);
MachineBasicBlock *MBB = RegionBB[DefRegion];
Register Reg = RematMI.getOperand(0).getReg();
- unsigned SubReg = RematMI.getOperand(0).getSubReg();
// Re-rematerialize MI at the end of its original region. Note that it may
// not be rematerialized exactly in the same position as originally within
// the region, but it should not matter much.
- TII->reMaterialize(*MBB, InsertPos, Reg, SubReg, RematMI, *DAG.TRI);
+ TII->reMaterialize(*MBB, InsertPos, Reg, AMDGPU::NoSubRegister, RematMI,
+ *DAG.TRI);
MachineInstr *NewMI = &*std::prev(InsertPos);
- NewMI->getOperand(0).setSubReg(SubReg);
DAG.LIS->InsertMachineInstrInMaps(*NewMI);
auto UseRegion = MIRegion.find(Remat.UseMI);
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 8b758b0..88a269f 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -123,6 +123,7 @@ protected:
bool HasSMemRealTime = false;
bool HasIntClamp = false;
bool HasFmaMixInsts = false;
+ bool HasFmaMixBF16Insts = false;
bool HasMovrel = false;
bool HasVGPRIndexMode = false;
bool HasScalarDwordx3Loads = false;
@@ -244,7 +245,9 @@ protected:
bool HasVMEMtoScalarWriteHazard = false;
bool HasSMEMtoVectorWriteHazard = false;
bool HasInstFwdPrefetchBug = false;
+ bool HasVmemPrefInsts = false;
bool HasSafeSmemPrefetch = false;
+ bool HasSafeCUPrefetch = false;
bool HasVcmpxExecWARHazard = false;
bool HasLdsBranchVmemWARHazard = false;
bool HasNSAtoVMEMBug = false;
@@ -265,8 +268,10 @@ protected:
bool HasIEEEMinimumMaximumInsts = false;
bool HasMinimum3Maximum3F32 = false;
bool HasMinimum3Maximum3F16 = false;
+ bool HasMin3Max3PKF16 = false;
bool HasMinimum3Maximum3PKF16 = false;
bool HasLshlAddU64Inst = false;
+ bool HasAddSubU64Insts = false;
bool HasPointSampleAccel = false;
bool HasLdsBarrierArriveAtomic = false;
bool HasSetPrioIncWgInst = false;
@@ -460,6 +465,8 @@ public:
return HasFmaMixInsts;
}
+ bool hasFmaMixBF16Insts() const { return HasFmaMixBF16Insts; }
+
bool hasCARRY() const {
return true;
}
@@ -985,8 +992,12 @@ public:
bool hasPrefetch() const { return GFX12Insts; }
+ bool hasVmemPrefInsts() const { return HasVmemPrefInsts; }
+
bool hasSafeSmemPrefetch() const { return HasSafeSmemPrefetch; }
+ bool hasSafeCUPrefetch() const { return HasSafeCUPrefetch; }
+
// Has s_cmpk_* instructions.
bool hasSCmpK() const { return getGeneration() < GFX12; }
@@ -1306,7 +1317,7 @@ public:
bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
- bool hasVALUReadSGPRHazard() const { return getGeneration() == GFX12; }
+ bool hasVALUReadSGPRHazard() const { return GFX12Insts && !GFX1250Insts; }
/// Return if operations acting on VGPR tuples require even alignment.
bool needsAlignedVGPRs() const { return GFX90AInsts || GFX1250Insts; }
@@ -1387,6 +1398,8 @@ public:
return HasMinimum3Maximum3F16;
}
+ bool hasMin3Max3PKF16() const { return HasMin3Max3PKF16; }
+
bool hasTanhInsts() const { return HasTanhInsts; }
bool hasAddPC64Inst() const { return GFX1250Insts; }
@@ -1500,6 +1513,12 @@ public:
bool hasVOPD3() const { return GFX1250Insts; }
+ // \returns true if the target has V_ADD_U64/V_SUB_U64 instructions.
+ bool hasAddSubU64Insts() const { return HasAddSubU64Insts; }
+
+ // \returns true if the target has V_MUL_U64/V_MUL_I64 instructions.
+ bool hasVectorMulU64() const { return GFX1250Insts; }
+
// \returns true if the target has V_PK_ADD_{MIN|MAX}_{I|U}16 instructions.
bool hasPkAddMinMaxInsts() const { return GFX1250Insts; }
diff --git a/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
index 429ce0e0..a33dbfa 100644
--- a/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
@@ -270,5 +270,6 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
MI.eraseFromParent();
}
}
+ finalizeBundles(MF);
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp b/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp
index 2a3b42e..eff5b0a 100644
--- a/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp
@@ -138,7 +138,6 @@ void R600PassConfig::addPreSched2() {
void R600PassConfig::addPreEmitPass() {
addPass(createR600MachineCFGStructurizerPass());
addPass(createR600ExpandSpecialInstrsPass());
- addPass(&FinalizeMachineBundlesID);
addPass(createR600Packetizer());
addPass(createR600ControlFlowFinalizer());
}
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 3902d4c..40b8bcd 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -392,11 +392,13 @@ enum CPol {
TH_ATOMIC_CASCADE = 4, // Cascading vs regular
// Scope
- SCOPE = 0x3 << 3, // All Scope bits
- SCOPE_CU = 0 << 3,
- SCOPE_SE = 1 << 3,
- SCOPE_DEV = 2 << 3,
- SCOPE_SYS = 3 << 3,
+ SCOPE_SHIFT = 3,
+ SCOPE_MASK = 0x3,
+ SCOPE = SCOPE_MASK << SCOPE_SHIFT, // All Scope bits
+ SCOPE_CU = 0 << SCOPE_SHIFT,
+ SCOPE_SE = 1 << SCOPE_SHIFT,
+ SCOPE_DEV = 2 << SCOPE_SHIFT,
+ SCOPE_SYS = 3 << SCOPE_SHIFT,
NV = 1 << 5, // Non-volatile bit
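Illustrative helpers (not part of the patch) showing what the SCOPE_SHIFT/SCOPE_MASK split enables: scope values can now be packed into and extracted from a cache-policy word instead of only being compared against the pre-shifted SCOPE_* constants.

#include <cstdint>

constexpr uint32_t SCOPE_SHIFT = 3;
constexpr uint32_t SCOPE_MASK = 0x3;

// Extract the 2-bit scope field: 0 = CU, 1 = SE, 2 = DEV, 3 = SYS.
constexpr uint32_t getScope(uint32_t CPol) {
  return (CPol >> SCOPE_SHIFT) & SCOPE_MASK;
}

// Replace the scope field of a cache-policy word with a new value.
constexpr uint32_t setScope(uint32_t CPol, uint32_t Scope) {
  return (CPol & ~(SCOPE_MASK << SCOPE_SHIFT)) |
         ((Scope & SCOPE_MASK) << SCOPE_SHIFT);
}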
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index e5d1eaa..b77da4d 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1062,9 +1062,13 @@ bool SIFoldOperandsImpl::tryFoldRegSeqSplat(
switch (OpTy) {
case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP32:
OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0);
break;
case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT64:
OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0_sub1);
break;
default:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index d65c3ae..8d51ec6 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -874,13 +874,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom);
- if (Subtarget->hasScalarSMulU64())
+ if (Subtarget->hasVectorMulU64())
+ setOperationAction(ISD::MUL, MVT::i64, Legal);
+ else if (Subtarget->hasScalarSMulU64())
setOperationAction(ISD::MUL, MVT::i64, Custom);
if (Subtarget->hasMad64_32())
setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom);
- if (Subtarget->hasPrefetch() && Subtarget->hasSafeSmemPrefetch())
+ if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
if (Subtarget->hasIEEEMinimumMaximumInsts()) {
@@ -944,6 +946,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Legal);
}
+ if (Subtarget->hasBF16PackedInsts()) {
+ setOperationAction(
+ {ISD::FADD, ISD::FMUL, ISD::FMINNUM, ISD::FMAXNUM, ISD::FMA},
+ MVT::v2bf16, Legal);
+ }
+
if (Subtarget->hasBF16TransInsts()) {
setOperationAction({ISD::FEXP2, ISD::FLOG2, ISD::FSQRT}, MVT::bf16, Legal);
}
@@ -1053,10 +1061,12 @@ ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const {
// where this is OK to use.
bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
EVT DestVT, EVT SrcVT) const {
- return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
- (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
- DestVT.getScalarType() == MVT::f32 &&
- SrcVT.getScalarType() == MVT::f16 &&
+ return DestVT.getScalarType() == MVT::f32 &&
+ ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
+ (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
+ SrcVT.getScalarType() == MVT::f16) ||
+ (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
+ SrcVT.getScalarType() == MVT::bf16)) &&
// TODO: This probably only requires no input flushing?
denormalModeIsFlushAllF32(DAG.getMachineFunction());
}
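A hedged restatement of the updated isFPExtFoldable condition as a standalone predicate (the enum and parameter names are invented for illustration, and the trailing denormal-mode check of the real function is omitted): the fpext is foldable when the destination is f32 and either the source is f16 on a target with the f16 mad/fma mix instructions, or the source is bf16, the opcode is FMA, and the target has the new bf16 mix instructions.

enum class Opc { FMAD, FMA, Other };
enum class Ty { F16, BF16, F32, Other };

bool isFPExtFoldableSketch(Opc Opcode, Ty DestTy, Ty SrcTy, bool HasMadMixInsts,
                           bool HasFmaMixInsts, bool HasFmaMixBF16Insts) {
  if (DestTy != Ty::F32)
    return false;
  bool F16Case = SrcTy == Ty::F16 &&
                 ((Opcode == Opc::FMAD && HasMadMixInsts) ||
                  (Opcode == Opc::FMA && HasFmaMixInsts));
  bool BF16Case =
      SrcTy == Ty::BF16 && Opcode == Opc::FMA && HasFmaMixBF16Insts;
  return F16Case || BF16Case;
}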
@@ -1467,6 +1477,12 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
MachineMemOperand::MOVolatile;
return true;
}
+ case Intrinsic::amdgcn_flat_load_monitor_b32:
+ case Intrinsic::amdgcn_flat_load_monitor_b64:
+ case Intrinsic::amdgcn_flat_load_monitor_b128:
+ case Intrinsic::amdgcn_global_load_monitor_b32:
+ case Intrinsic::amdgcn_global_load_monitor_b64:
+ case Intrinsic::amdgcn_global_load_monitor_b128:
case Intrinsic::amdgcn_ds_load_tr6_b96:
case Intrinsic::amdgcn_ds_load_tr4_b64:
case Intrinsic::amdgcn_ds_load_tr8_b64:
@@ -1540,7 +1556,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
return true;
}
- case Intrinsic::amdgcn_s_prefetch_data: {
+ case Intrinsic::amdgcn_s_prefetch_data:
+ case Intrinsic::amdgcn_flat_prefetch:
+ case Intrinsic::amdgcn_global_prefetch: {
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
Info.ptrVal = CI.getArgOperand(0);
@@ -1591,10 +1609,16 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
case Intrinsic::amdgcn_flat_atomic_fmax_num:
case Intrinsic::amdgcn_flat_atomic_fmin_num:
+ case Intrinsic::amdgcn_flat_load_monitor_b128:
+ case Intrinsic::amdgcn_flat_load_monitor_b32:
+ case Intrinsic::amdgcn_flat_load_monitor_b64:
case Intrinsic::amdgcn_global_atomic_csub:
case Intrinsic::amdgcn_global_atomic_fmax_num:
case Intrinsic::amdgcn_global_atomic_fmin_num:
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
+ case Intrinsic::amdgcn_global_load_monitor_b128:
+ case Intrinsic::amdgcn_global_load_monitor_b32:
+ case Intrinsic::amdgcn_global_load_monitor_b64:
case Intrinsic::amdgcn_global_load_tr_b64:
case Intrinsic::amdgcn_global_load_tr_b128:
case Intrinsic::amdgcn_global_load_tr4_b64:
@@ -4432,19 +4456,28 @@ SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op,
}
SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
- if (Op->isDivergent())
+ if (Op->isDivergent() &&
+ (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
+ // Cannot do I$ prefetch with divergent pointer.
return SDValue();
switch (cast<MemSDNode>(Op)->getAddressSpace()) {
case AMDGPUAS::FLAT_ADDRESS:
case AMDGPUAS::GLOBAL_ADDRESS:
case AMDGPUAS::CONSTANT_ADDRESS:
- case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
break;
+ case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
+ if (Subtarget->hasSafeSmemPrefetch())
+ break;
+ [[fallthrough]];
default:
return SDValue();
}
+ // I$ prefetch
+ if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))
+ return SDValue();
+
return Op;
}
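A sketch of the legality decision implemented in lowerPREFETCH above, pulled out as a standalone predicate (the enum and parameter names are hypothetical; the feature queries mirror hasVmemPrefInsts and hasSafeSmemPrefetch from the patch). Operand 4 of the PREFETCH node distinguishes instruction (0) from data (1) prefetches, which is the IsDataPrefetch flag below.

enum class AS { Flat, Global, Constant, Constant32, Other };

bool keepPrefetch(AS AddrSpace, bool DivergentPtr, bool IsDataPrefetch,
                  bool HasVmemPrefInsts, bool HasSafeSmemPrefetch) {
  // A divergent pointer only works with a VMEM data prefetch.
  if (DivergentPtr && (!HasVmemPrefInsts || !IsDataPrefetch))
    return false;
  switch (AddrSpace) {
  case AS::Flat:
  case AS::Global:
  case AS::Constant:
    break;
  case AS::Constant32:
    if (!HasSafeSmemPrefetch)
      return false;
    break;
  default:
    return false;
  }
  // Instruction-cache prefetches need the safe SMEM prefetch path.
  if (!HasSafeSmemPrefetch && !IsDataPrefetch)
    return false;
  return true;
}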
@@ -5415,6 +5448,19 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineOperand &Src0 = MI.getOperand(1);
MachineOperand &Src1 = MI.getOperand(2);
+ if (ST.hasAddSubU64Insts()) {
+ auto I = BuildMI(*BB, MI, DL,
+ TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
+ : AMDGPU::V_SUB_U64_e64),
+ Dest.getReg())
+ .add(Src0)
+ .add(Src1)
+ .addImm(0); // clamp
+ TII->legalizeOperands(*I);
+ MI.eraseFromParent();
+ return BB;
+ }
+
if (IsAdd && ST.hasLshlAddU64Inst()) {
auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
Dest.getReg())
@@ -14047,7 +14093,8 @@ static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
case ISD::FMAXIMUMNUM:
case AMDGPUISD::FMIN_LEGACY:
case AMDGPUISD::FMAX_LEGACY:
- return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16());
+ return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
+ (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
case ISD::FMINIMUM:
case ISD::FMAXIMUM:
return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
@@ -14132,6 +14179,8 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
(VT == MVT::f32 || VT == MVT::f64 ||
(VT == MVT::f16 && Subtarget->has16BitInsts()) ||
+ (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
+ (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
(VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
Op0.hasOneUse()) {
if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 9faf497..dd3f2fe 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -2108,8 +2108,9 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
assert(TII->isFLAT(MI));
- // All flat instructions use the VMEM counter.
- assert(TII->usesVM_CNT(MI));
+ // All flat instructions use the VMEM counter except prefetch.
+ if (!TII->usesVM_CNT(MI))
+ return false;
// If there are no memory operands then conservatively assume the flat
// operation may access VMEM.
@@ -2295,9 +2296,6 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
}
- // A Flat memory operation must access at least one address space.
- assert(FlatASCount);
-
// This is a flat memory operation that access both VMEM and LDS, so note it
// - it will require that both the VM and LGKM be flushed to zero if it is
// pending when a VM or LGKM dependency occurs.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 571f3ef..8d6c1d0 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2508,7 +2508,20 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
.addReg(DstHi);
}
break;
+
+ case AMDGPU::V_MAX_BF16_PSEUDO_e64:
+ assert(ST.hasBF16PackedInsts());
+ MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
+ MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
+ MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo
+ MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi
+ auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
+ Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
+ auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
+ Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
+ break;
}
+
return true;
}
@@ -2733,49 +2746,47 @@ static MachineInstr *swapImmOperands(MachineInstr &MI,
}
bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
- const MachineOperand *MO0, unsigned OpIdx1,
- const MachineOperand *MO1) const {
+ unsigned OpIdx1) const {
const MCInstrDesc &InstDesc = MI.getDesc();
const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
- const TargetRegisterClass *DefinedRC1 =
- OpInfo1.RegClass != -1 ? RI.getRegClass(OpInfo1.RegClass) : nullptr;
- const TargetRegisterClass *DefinedRC0 =
- OpInfo1.RegClass != -1 ? RI.getRegClass(OpInfo0.RegClass) : nullptr;
unsigned Opc = MI.getOpcode();
int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+ const MachineOperand &MO0 = MI.getOperand(OpIdx0);
+ const MachineOperand &MO1 = MI.getOperand(OpIdx1);
+
// Swap doesn't breach constant bus or literal limits
// It may move literal to position other than src0, this is not allowed
// pre-gfx10 However, most test cases need literals in Src0 for VOP
// FIXME: After gfx9, literal can be in place other than Src0
if (isVALU(MI)) {
- if ((int)OpIdx0 == Src0Idx && !MO0->isReg() &&
- !isInlineConstant(*MO0, OpInfo1))
+ if ((int)OpIdx0 == Src0Idx && !MO0.isReg() &&
+ !isInlineConstant(MO0, OpInfo1))
return false;
- if ((int)OpIdx1 == Src0Idx && !MO1->isReg() &&
- !isInlineConstant(*MO1, OpInfo0))
+ if ((int)OpIdx1 == Src0Idx && !MO1.isReg() &&
+ !isInlineConstant(MO1, OpInfo0))
return false;
}
- if ((int)OpIdx1 != Src0Idx && MO0->isReg()) {
- if (!DefinedRC1)
+ if ((int)OpIdx1 != Src0Idx && MO0.isReg()) {
+ if (OpInfo1.RegClass == -1)
return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
- return isLegalRegOperand(MI, OpIdx1, *MO0) &&
- (!MO1->isReg() || isLegalRegOperand(MI, OpIdx0, *MO1));
+ return isLegalRegOperand(MI, OpIdx1, MO0) &&
+ (!MO1.isReg() || isLegalRegOperand(MI, OpIdx0, MO1));
}
- if ((int)OpIdx0 != Src0Idx && MO1->isReg()) {
- if (!DefinedRC0)
+ if ((int)OpIdx0 != Src0Idx && MO1.isReg()) {
+ if (OpInfo0.RegClass == -1)
return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
- return (!MO0->isReg() || isLegalRegOperand(MI, OpIdx1, *MO0)) &&
- isLegalRegOperand(MI, OpIdx0, *MO1);
+ return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx1, MO0)) &&
+ isLegalRegOperand(MI, OpIdx0, MO1);
}
// No need to check 64-bit literals since swapping does not bring new
// 64-bit literals into current instruction to fold to 32-bit
- return isImmOperandLegal(MI, OpIdx1, *MO0);
+ return isImmOperandLegal(MI, OpIdx1, MO0);
}
MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
@@ -2797,12 +2808,12 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
static_cast<int>(Src1Idx) &&
"inconsistency with findCommutedOpIndices");
- MachineOperand &Src0 = MI.getOperand(Src0Idx);
- MachineOperand &Src1 = MI.getOperand(Src1Idx);
- if (!isLegalToSwap(MI, Src0Idx, &Src0, Src1Idx, &Src1)) {
+ if (!isLegalToSwap(MI, Src0Idx, Src1Idx))
return nullptr;
- }
+
MachineInstr *CommutedMI = nullptr;
+ MachineOperand &Src0 = MI.getOperand(Src0Idx);
+ MachineOperand &Src1 = MI.getOperand(Src1Idx);
if (Src0.isReg() && Src1.isReg()) {
// Be sure to copy the source modifiers to the right place.
CommutedMI =
@@ -7361,6 +7372,10 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
}
case AMDGPU::S_MUL_U64:
+ if (ST.hasVectorMulU64()) {
+ NewOpcode = AMDGPU::V_MUL_U64_e64;
+ break;
+ }
// Split s_mul_u64 in 32-bit vector multiplications.
splitScalarSMulU64(Worklist, Inst, MDT);
Inst.eraseFromParent();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 800ea9a..2ffb783 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -197,8 +197,7 @@ protected:
AMDGPU::OpName Src0OpName, MachineOperand &Src1,
AMDGPU::OpName Src1OpName) const;
bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx,
- const MachineOperand *fromMO, unsigned toIdx,
- const MachineOperand *toMO) const;
+ unsigned toIdx) const;
MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
unsigned OpIdx0,
unsigned OpIdx1) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index bd4995b..83b0490 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1662,6 +1662,8 @@ def VOP3OpSelMods : ComplexPattern<untyped, 2, "SelectVOP3OpSelMods">;
def VOP3PMadMixModsExt : ComplexPattern<untyped, 2, "SelectVOP3PMadMixModsExt">;
def VOP3PMadMixMods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixMods">;
+def VOP3PMadMixBF16ModsExt : ComplexPattern<untyped, 2, "SelectVOP3PMadMixBF16ModsExt">;
+def VOP3PMadMixBF16Mods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixBF16Mods">;
def VINTERPMods : ComplexPattern<untyped, 2, "SelectVINTERPMods">;
def VINTERPModsHi : ComplexPattern<untyped, 2, "SelectVINTERPModsHi">;
@@ -2863,9 +2865,11 @@ def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>;
def VOP_F16_F16_I32 : VOPProfile <[f16, f16, i32, untyped]>;
def VOP_I16_I16_I16 : VOPProfile <[i16, i16, i16, untyped]>;
def VOP_I16_I16_I16_ARITH : VOPProfile <[i16, i16, i16, untyped], /*EnableClamp=*/1>;
+def VOP_BF16_BF16_BF16 : VOPProfile <[bf16, bf16, bf16, untyped]>;
def VOP_I16_I16_I16_I16 : VOPProfile <[i16, i16, i16, i16, untyped]>;
def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>;
+def VOP_BF16_BF16_BF16_BF16 : VOPProfile <[bf16, bf16, bf16, bf16, untyped]>;
def VOP_I32_I16_I16_I32 : VOPProfile <[i32, i16, i16, i32, untyped]>;
def VOP_I32_I16 : VOPProfile <[i32, i16, untyped, untyped]>;
@@ -2873,10 +2877,12 @@ def VOP_I16_I32 : VOPProfile <[i16, i32, untyped, untyped]>;
def VOP_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, untyped]>;
def VOP_V2I16_V2I16_V2I16 : VOPProfile <[v2i16, v2i16, v2i16, untyped]>;
+def VOP_V2BF16_V2BF16_V2BF16 : VOPProfile <[v2bf16, v2bf16, v2bf16, untyped]>;
def VOP_B32_F16_F16 : VOPProfile <[i32, f16, f16, untyped]>;
def VOP_V2F16_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, v2f16]>;
def VOP_V2I16_V2I16_V2I16_V2I16 : VOPProfile <[v2i16, v2i16, v2i16, v2i16]>;
+def VOP_V2BF16_V2BF16_V2BF16_V2BF16 : VOPProfile <[v2bf16, v2bf16, v2bf16, v2bf16]>;
def VOP_V2I16_F32_F32 : VOPProfile <[v2i16, f32, f32, untyped]>;
def VOP_V2I16_I32_I32 : VOPProfile <[v2i16, i32, i32, untyped]>;
@@ -2912,8 +2918,10 @@ def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>;
def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>;
def VOP_I16_F32_F32 : VOPProfile <[i16, f32, f32, untyped]>;
def VOP_I32_I32_I32_ARITH : VOPProfile <[i32, i32, i32, untyped], /*EnableClamp=*/1>;
+def VOP_I64_I64_I64_ARITH : VOPProfile <[i64, i64, i64, untyped], /*EnableClamp=*/1>;
def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>;
def VOP_F32_F16_F16_F16 : VOPProfile <[f32, f16, f16, f16]>;
+def VOP_F32_BF16_BF16_BF16 : VOPProfile <[f32, bf16, bf16, bf16]>;
def VOP_V2BF16_F32_F32 : VOPProfile <[v2bf16, f32, f32, untyped]>;
def VOP_V32F32_V6I32_F32 : VOPProfile <[v32f32, v6i32, f32, untyped]>;
def VOP_V32F16_V6I32_F32 : VOPProfile <[v32f16, v6i32, f32, untyped]>;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index d05be8f..54fa192 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1894,6 +1894,9 @@ let SubtargetPredicate = UseRealTrue16Insts in
def : ClampPat<V_MAX_F16_t16_e64, f16>;
let SubtargetPredicate = UseFakeTrue16Insts in
def : ClampPat<V_MAX_F16_fake16_e64, f16>;
+// FIXME-TRUE16: Pseudo expansion of this won't work with True16.
+let True16Predicate = UseFakeTrue16Insts in
+def : ClampPat<V_MAX_BF16_PSEUDO_e64, bf16>;
let SubtargetPredicate = HasVOP3PInsts in {
def : GCNPat <
@@ -1903,6 +1906,13 @@ def : GCNPat <
>;
}
+let SubtargetPredicate = HasBF16PackedInsts in {
+def : GCNPat <
+ (v2bf16 (AMDGPUclamp (VOP3PMods v2bf16:$src0, i32:$src0_modifiers))),
+ (V_PK_MAX_NUM_BF16 $src0_modifiers, $src0,
+ $src0_modifiers, $src0, DSTCLAMP.ENABLE)
+>;
+} // End SubtargetPredicate = HasBF16PackedInsts
/********** ================================ **********/
/********** Floating point absolute/negative **********/
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index f0be204..9a1448f 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -81,11 +81,15 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
}
- MayNeedAGPRs = ST.hasMAIInsts() && !MFMAVGPRForm;
- if (!MFMAVGPRForm && ST.hasGFX90AInsts() &&
- ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() &&
- !mayUseAGPRs(F))
- MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
+ MayNeedAGPRs = ST.hasMAIInsts();
+ if (ST.hasGFX90AInsts()) {
+ // FIXME: MayNeedAGPRs is a misnomer for how this is used. MFMA selection
+ // should be separated from availability of AGPRs
+ if (MFMAVGPRForm ||
+ (ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() &&
+ !mayUseAGPRs(F)))
+ MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
+ }
if (AMDGPU::isChainCC(CC)) {
// Chain functions don't receive an SP from their caller, but are free to
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 3212060..0e8a420 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -704,16 +704,16 @@ void diagnoseUnknownMMRAASName(const MachineInstr &MI, StringRef AS) {
DiagnosticInfoUnsupported(Fn, Str.str(), MI.getDebugLoc(), DS_Warning));
}
-/// Reads \p MI's MMRAs to parse the "amdgpu-as" MMRA.
-/// If this tag isn't present, or if it has no meaningful values, returns \p
-/// Default. Otherwise returns all the address spaces concerned by the MMRA.
-static SIAtomicAddrSpace getFenceAddrSpaceMMRA(const MachineInstr &MI,
- SIAtomicAddrSpace Default) {
- static constexpr StringLiteral FenceASPrefix = "amdgpu-as";
+/// Reads \p MI's MMRAs to parse the "amdgpu-synchronize-as" MMRA.
+/// If this tag isn't present, or if it has no meaningful values, returns
+/// \p none, otherwise returns the address spaces specified by the MD.
+static std::optional<SIAtomicAddrSpace>
+getSynchronizeAddrSpaceMD(const MachineInstr &MI) {
+ static constexpr StringLiteral FenceASPrefix = "amdgpu-synchronize-as";
auto MMRA = MMRAMetadata(MI.getMMRAMetadata());
if (!MMRA)
- return Default;
+ return std::nullopt;
SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE;
for (const auto &[Prefix, Suffix] : MMRA) {
@@ -726,7 +726,10 @@ static SIAtomicAddrSpace getFenceAddrSpaceMMRA(const MachineInstr &MI,
diagnoseUnknownMMRAASName(MI, Suffix);
}
- return (Result != SIAtomicAddrSpace::NONE) ? Result : Default;
+ if (Result == SIAtomicAddrSpace::NONE)
+ return std::nullopt;
+
+ return Result;
}
} // end anonymous namespace
@@ -903,12 +906,19 @@ SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
*ScopeOrNone;
- if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
- ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
+ if (OrderingAddrSpace != SIAtomicAddrSpace::ATOMIC) {
+ // We currently expect refineOrderingAS to be the only place that
+ // can refine the AS ordered by the fence.
+ // If that changes, we need to review the semantics of that function
+ // in case it needs to preserve certain address spaces.
reportUnsupported(MI, "Unsupported atomic address space");
return std::nullopt;
}
+ auto SynchronizeAS = getSynchronizeAddrSpaceMD(*MI);
+ if (SynchronizeAS)
+ OrderingAddrSpace = *SynchronizeAS;
+
return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
}
@@ -2687,11 +2697,7 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
AtomicPseudoMIs.push_back(MI);
bool Changed = false;
- // Refine fenced address space based on MMRAs.
- //
- // TODO: Should we support this MMRA on other atomic operations?
- auto OrderingAddrSpace =
- getFenceAddrSpaceMMRA(*MI, MOI.getOrderingAddrSpace());
+ const SIAtomicAddrSpace OrderingAddrSpace = MOI.getOrderingAddrSpace();
if (MOI.isAtomic()) {
const AtomicOrdering Order = MOI.getOrdering();
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 0039d2f..218841d 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -109,6 +109,23 @@ class SIRegisterClass <string n, list<ValueType> rTypes, int Align, dag rList>
let TSFlags{2} = HasVGPR;
let TSFlags{3} = HasAGPR;
let TSFlags{4} = HasSGPR;
+
+  // RA uses RegisterClass AllocationPriority, amongst other info (e.g. ordering in the basic block),
+  // to decide which registers to try to assign first. This RegisterClass priority is usually given
+  // very high weight, if not the highest, when considering which VirtReg to allocate next.
+  //
+  // We have 5 bits to assign AllocationPriorities to RegisterClasses. Generally, it is beneficial to
+  // assign more constrained RegisterClasses first. As a result, we prioritize register classes with
+  // more 32-bit tuples (e.g. VReg_512) over register classes with fewer tuples (e.g. VGPR_32).
+  //
+  // The interesting case is the vector register case on architectures which have ARegs, VRegs, and AVRegs.
+  // In this case, we would like to assign ARegs and VRegs before AVRegs, as AVRegs are less constrained
+  // and can be assigned to both AGPRs and VGPRs. We use the 5th bit to encode this into the
+  // RegisterClass AllocationPriority. BaseClassPriority is used to turn the bit on, and BaseClassScaleFactor
+  // is used for scaling of the bit (i.e. 1 << 4).
+ field int BaseClassPriority = 1;
+ field int BaseClassScaleFactor = 16;
+
}
multiclass SIRegLoHi16 <string n, bits<8> regIdx, bit ArtificialHigh = 1,
@@ -575,7 +592,7 @@ let HasVGPR = 1 in {
def VGPR_16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
(add (interleave (sequence "VGPR%u_LO16", 0, 255),
(sequence "VGPR%u_HI16", 0, 255)))> {
- let AllocationPriority = 2;
+ let AllocationPriority = !add(2, !mul(BaseClassPriority, BaseClassScaleFactor));
let Size = 16;
let GeneratePressureSet = 0;
@@ -601,7 +618,7 @@ def VGPR_16_Lo128 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
// i16/f16 only on VI+
def VGPR_32 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32,
(add (sequence "VGPR%u", 0, 255))> {
- let AllocationPriority = 0;
+ let AllocationPriority = !add(0, !mul(BaseClassPriority, BaseClassScaleFactor));
let Size = 32;
let Weight = 1;
let BaseClassOrder = 32;
@@ -610,7 +627,7 @@ def VGPR_32 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types
// Identical to VGPR_32 except it only contains the low 128 (Lo128) registers.
def VGPR_32_Lo128 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32,
(add (sequence "VGPR%u", 0, 127))> {
- let AllocationPriority = 0;
+ let AllocationPriority = !add(0, !mul(BaseClassPriority, BaseClassScaleFactor));
let GeneratePressureSet = 0;
let Size = 32;
let Weight = 1;
@@ -668,7 +685,7 @@ def AGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
// AccVGPR 32-bit registers
def AGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
(add (sequence "AGPR%u", 0, 255))> {
- let AllocationPriority = 0;
+ let AllocationPriority = !add(0, !mul(BaseClassPriority, BaseClassScaleFactor));
let Size = 32;
let Weight = 1;
let BaseClassOrder = 32;
@@ -940,14 +957,23 @@ class VRegClassBase<int numRegs, list<ValueType> regTypes, dag regList> :
// Requires n v_mov_b32 to copy
let CopyCost = numRegs;
- let AllocationPriority = !sub(numRegs, 1);
+
+  // Since we only have 5 bits for the RegisterClass AllocationPriority, and since we use the
+  // 5th bit for BaseClassPriority, we need to encode the SizePriority into 4 bits. As a result
+  // of this encoding, for register classes with numRegs 15 or 16, we give a SizePriority of 14, and for
+  // register classes with numRegs 17+ we give a SizePriority of 15. In practice, there is only one
+  // RegClass per Vector Register type in each of these groups (i.e. numRegs = 15,16 : {VReg_512},
+  // and numRegs = 17+ : {VReg_1024}). Therefore, we have not lost any info by compressing.
+  defvar SizePriority = !if(!le(numRegs, 14), !sub(numRegs, 1), !if(!le(numRegs, 16), 14, 15));
+
+  let AllocationPriority = !add(SizePriority, !mul(BaseClassPriority, BaseClassScaleFactor));
let Weight = numRegs;
}
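A small worked example (plain C++ mirroring the TableGen arithmetic above; the helper names are invented) of how the 5-bit AllocationPriority is assembled from the 4-bit size priority and the base-class bit, so that single-bank VGPR/AGPR classes always outrank the less constrained AV classes of any size.

unsigned sizePriority(unsigned NumRegs) {
  // numRegs 1..14 map to 0..13, 15-16 compress to 14, 17+ compress to 15.
  if (NumRegs <= 14)
    return NumRegs - 1;
  return NumRegs <= 16 ? 14 : 15;
}

unsigned allocationPriority(unsigned NumRegs, bool IsBaseClass) {
  constexpr unsigned BaseClassScaleFactor = 16; // 1 << 4
  return sizePriority(NumRegs) + (IsBaseClass ? BaseClassScaleFactor : 0);
}

// e.g. allocationPriority(16, /*IsBaseClass=*/true)  == 30  (VReg_512)
//      allocationPriority(16, /*IsBaseClass=*/false) == 14  (AV_512)
//      allocationPriority(1,  /*IsBaseClass=*/true)  == 16  (VGPR_32)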
// Define a register tuple class, along with one requiring an even
// aligned base register.
multiclass VRegClass<int numRegs, list<ValueType> regTypes, dag regList> {
- let HasVGPR = 1 in {
+ let HasVGPR = 1, BaseClassPriority = 1 in {
// Define the regular class.
def "" : VRegClassBase<numRegs, regTypes, regList> {
let BaseClassOrder = !mul(numRegs, 32);
@@ -981,7 +1007,7 @@ defm VReg_1024 : VRegClass<32, Reg1024Types.types, (add VGPR_1024)>;
}
multiclass ARegClass<int numRegs, list<ValueType> regTypes, dag regList> {
- let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1 in {
+ let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1, BaseClassPriority = 1 in {
// Define the regular class.
def "" : VRegClassBase<numRegs, regTypes, regList> {
let BaseClassOrder = !mul(numRegs, 32);
@@ -1066,6 +1092,7 @@ def VS_64 : SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32, (add VReg_64, SReg_6
def AV_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, (add VGPR_32, AGPR_32)> {
let HasVGPR = 1;
let HasAGPR = 1;
+ let BaseClassPriority = 0;
let Size = 32;
}
} // End GeneratePressureSet = 0
@@ -1074,7 +1101,7 @@ def AV_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, (add VGPR_32, AGPR_3
// aligned base register.
multiclass AVRegClass<int numRegs, list<ValueType> regTypes,
dag vregList, dag aregList> {
- let HasVGPR = 1, HasAGPR = 1 in {
+ let HasVGPR = 1, HasAGPR = 1, BaseClassPriority = 0 in {
// Define the regular class.
def "" : VRegClassBase<numRegs, regTypes, (add vregList, aregList)>;
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index 38cc51b..4bda51d 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -856,9 +856,9 @@ def smrd_sextloadi16 : SMRDLoadPat<sextloadi16>;
def smrd_prefetch : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type),
(prefetch node:$ptr, node:$rw, node:$loc, node:$type),
- [{ return !N->getOperand(1)->isDivergent();}]> {
+ [{ return !N->getOperand(1)->isDivergent() && Subtarget->hasSafeSmemPrefetch();}]> {
let GISelPredicateCode = [{
- return isInstrUniform(MI);
+ return isInstrUniform(MI) && Subtarget->hasSafeSmemPrefetch();
}];
}
@@ -1152,6 +1152,7 @@ multiclass SMPrefetchPat<string type, TImmLeaf cache_type> {
}
defm : SMPrefetchPat<"INST", i32imm_zero>;
+let AddedComplexity = 12 in // Prefer scalar prefetch over global for r/o case.
defm : SMPrefetchPat<"DATA", i32imm_one>;
let SubtargetPredicate = isGFX12Plus in {
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 030a6e1..550ec9d 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -925,6 +925,17 @@ let isAdd = 1 in {
defm V_ADDC_U32 : VOP2bInst <"v_addc_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "v_addc_u32">;
}
+let isReMaterializable = 1 in {
+let SubtargetPredicate = HasAddSubU64Insts, SchedRW = [Write64Bit] in {
+defm V_ADD_U64 : VOP2Inst <"v_add_nc_u64", VOP_I64_I64_I64_ARITH>;
+// We don't actually have something like V_SUBREV_U64, so V_SUB_U64 can't be treated as commutable.
+let isCommutable = 0 in
+defm V_SUB_U64 : VOP2Inst <"v_sub_nc_u64", VOP_I64_I64_I64_ARITH>;
+} // End SubtargetPredicate = HasAddSubU64Insts, SchedRW = [Write64Bit]
+let SubtargetPredicate = isGFX1250Plus, SchedRW = [WriteDouble] in
+defm V_MUL_U64 : VOP2Inst <"v_mul_u64", VOP_I64_I64_I64, DivergentBinFrag<mul>>;
+} // End isReMaterializable = 1
+
} // End isCommutable = 1
// These are special and do not read the exec mask.
@@ -1754,6 +1765,9 @@ multiclass VOP2_Real_FULL_with_name<GFXGen Gen, bits<6> op, string opName,
VOP2_Realtriple_e64_with_name<Gen, op, opName, asmName>,
VOP2_Real_NO_VOP3_with_name<Gen, op, opName, asmName>;
+multiclass VOP2_Real_NO_DPP<GFXGen Gen, bits<6> op> :
+ VOP2_Real_e32<Gen, op>, VOP2_Real_e64<Gen, op>;
+
multiclass VOP2_Real_NO_DPP_with_name<GFXGen Gen, bits<6> op, string opName,
string asmName> {
defm NAME : VOP2_Real_e32_with_name<Gen, op, opName, asmName>,
@@ -1843,6 +1857,9 @@ defm V_FMAC_F64 : VOP2_Real_FULL<GFX12Gen, 0x17>;
defm V_FMAMK_F64 : VOP2Only_Real_MADK64<GFX1250Gen, 0x23>;
defm V_FMAAK_F64 : VOP2Only_Real_MADK64<GFX1250Gen, 0x24>;
+defm V_ADD_U64 : VOP2_Real_FULL<GFX1250Gen, 0x28>;
+defm V_SUB_U64 : VOP2_Real_FULL<GFX1250Gen, 0x29>;
+defm V_MUL_U64 : VOP2_Real_NO_DPP<GFX1250Gen, 0x2a>;
//===----------------------------------------------------------------------===//
// GFX11.
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index a259e5c..95fcd4a 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -35,14 +35,18 @@ class VOP3P_Mix_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR,
bit useTiedOutput = 0> : VOP3P_Profile<P, Features, 1> {
bit UseTiedOutput = useTiedOutput;
+ defvar Src0RC = getVCSrcForVT<P.Src0VT>.ret;
+ defvar Src1RC = getVCSrcForVT<P.Src1VT>.ret;
+ defvar Src2RC = getVCSrcForVT<P.Src2VT>.ret;
+
dag srcs =
- (ins FP16InputMods:$src0_modifiers, VCSrc_f16:$src0,
- FP16InputMods:$src1_modifiers, VCSrc_f16:$src1,
- FP16InputMods:$src2_modifiers, VCSrc_f16:$src2);
+ (ins FP16InputMods:$src0_modifiers, Src0RC:$src0,
+ FP16InputMods:$src1_modifiers, Src1RC:$src1,
+ FP16InputMods:$src2_modifiers, Src2RC:$src2);
dag dpp_srcs =
(ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0,
FPVRegInputMods:$src1_modifiers, VRegSrc_32:$src1,
- FP16InputMods:$src2_modifiers, VCSrc_f16:$src2);
+ FP16InputMods:$src2_modifiers, Src2RC:$src2);
// FIXME: Clamp0 misbehaves with the non-default vdst_in
// following it. For now workaround this by requiring clamp
@@ -144,48 +148,59 @@ def : VOP3PSatPat<usubsat, V_PK_SUB_U16>;
def : VOP3PSatPat<ssubsat, V_PK_SUB_I16>;
} // End SubtargetPredicate = HasVOP3PInsts
-let SubtargetPredicate = HasMinimum3Maximum3PKF16, FPDPRounding = 1 in {
+let isCommutable = 1, FPDPRounding = 1 in {
+let SubtargetPredicate = HasMin3Max3PKF16 in {
+defm V_PK_MIN3_NUM_F16 : VOP3PInst<"v_pk_min3_num_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, AMDGPUfmin3>;
+defm V_PK_MAX3_NUM_F16 : VOP3PInst<"v_pk_max3_num_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, AMDGPUfmax3>;
+}
+
+let SubtargetPredicate = HasMinimum3Maximum3PKF16 in {
defm V_PK_MINIMUM3_F16 : VOP3PInst<"v_pk_minimum3_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, AMDGPUfminimum3>;
defm V_PK_MAXIMUM3_F16 : VOP3PInst<"v_pk_maximum3_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, AMDGPUfmaximum3>;
}
+} // End isCommutable = 1, FPDPRounding = 1
// TODO: Make sure we're doing the right thing with denormals. Note
// that FMA and MAD will differ.
multiclass MadFmaMixPats<SDPatternOperator fma_like,
Instruction mix_inst,
Instruction mixlo_inst,
- Instruction mixhi_inst> {
+ Instruction mixhi_inst,
+ ValueType VT = f16,
+ ValueType vecVT = v2f16> {
+ defvar VOP3PMadMixModsPat = !if (!eq(VT, bf16), VOP3PMadMixBF16Mods, VOP3PMadMixMods);
+ defvar VOP3PMadMixModsExtPat = !if (!eq(VT, bf16), VOP3PMadMixBF16ModsExt, VOP3PMadMixModsExt);
// At least one of the operands needs to be an fpextend of an f16
// for this to be worthwhile, so we need three patterns here.
// TODO: Could we use a predicate to inspect src1/2/3 instead?
def : GCNPat <
- (f32 (fma_like (f32 (VOP3PMadMixModsExt f16:$src0, i32:$src0_mods)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_mods)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_mods)))),
+ (f32 (fma_like (f32 (VOP3PMadMixModsExtPat VT:$src0, i32:$src0_mods)),
+ (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_mods)),
+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_mods)))),
(mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
DSTCLAMP.NONE)>;
def : GCNPat <
- (f32 (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_mods)),
- (f32 (VOP3PMadMixModsExt f16:$src1, i32:$src1_mods)),
- (f32 (VOP3PMadMixMods f32:$src2, i32:$src2_mods)))),
+ (f32 (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_mods)),
+ (f32 (VOP3PMadMixModsExtPat VT:$src1, i32:$src1_mods)),
+ (f32 (VOP3PMadMixModsPat f32:$src2, i32:$src2_mods)))),
(mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
DSTCLAMP.NONE)>;
def : GCNPat <
- (f32 (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_mods)),
- (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_mods)),
- (f32 (VOP3PMadMixModsExt f16:$src2, i32:$src2_mods)))),
+ (f32 (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_mods)),
+ (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_mods)),
+ (f32 (VOP3PMadMixModsExtPat VT:$src2, i32:$src2_mods)))),
(mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
DSTCLAMP.NONE)>;
def : GCNPat <
(AMDGPUclamp (build_vector
- (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers))))),
- (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers))))))),
- (v2f16 (mixhi_inst $hi_src0_modifiers, $hi_src0,
+ (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$lo_src0, i32:$lo_src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$lo_src1, i32:$lo_src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$lo_src2, i32:$lo_src2_modifiers))))),
+ (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$hi_src0, i32:$hi_src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$hi_src1, i32:$hi_src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$hi_src2, i32:$hi_src2_modifiers))))))),
+ (vecVT (mixhi_inst $hi_src0_modifiers, $hi_src0,
$hi_src1_modifiers, $hi_src1,
$hi_src2_modifiers, $hi_src2,
DSTCLAMP.ENABLE,
@@ -197,8 +212,8 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
>;
def : GCNPat <
- (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers))))),
+ (VT (fpround (fmul (f32 (VOP3PMadMixModsPat f32:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_modifiers))))),
(mixlo_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
(i32 0), (i32 0),
@@ -207,9 +222,9 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
>;
def : GCNPat <
- (build_vector f16:$elt0, (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers)))))),
- (v2f16 (mixhi_inst $src0_modifiers, $src0,
+ (build_vector VT:$elt0, (VT (fpround (fmul (f32 (VOP3PMadMixModsPat f32:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_modifiers)))))),
+ (vecVT (mixhi_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
(i32 0), (i32 0),
DSTCLAMP.NONE,
@@ -217,9 +232,9 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
>;
def : GCNPat <
- (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
+ (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))),
(mixlo_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
$src2_modifiers, $src2,
@@ -234,10 +249,10 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
let True16Predicate = p in {
def : GCNPat <
- (build_vector f16:$elt0, (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
- (v2f16 (mixhi_inst $src0_modifiers, $src0,
+ (build_vector VT:$elt0, (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers)))))),
+ (vecVT (mixhi_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
$src2_modifiers, $src2,
DSTCLAMP.NONE,
@@ -246,11 +261,11 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
def : GCNPat <
(build_vector
- f16:$elt0,
- (AMDGPUclamp (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))))),
- (v2f16 (mixhi_inst $src0_modifiers, $src0,
+ VT:$elt0,
+ (AMDGPUclamp (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))))),
+ (vecVT (mixhi_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
$src2_modifiers, $src2,
DSTCLAMP.ENABLE,
@@ -261,38 +276,38 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
let True16Predicate = UseRealTrue16Insts in {
def : GCNPat <
- (build_vector (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))), f16:$elt1),
- (v2f16 (mixlo_inst $src0_modifiers, $src0,
+ (build_vector (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))), VT:$elt1),
+ (vecVT (mixlo_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
$src2_modifiers, $src2,
DSTCLAMP.NONE,
- (REG_SEQUENCE VGPR_32, (f16 (IMPLICIT_DEF)), lo16, $elt1, hi16)))
+ (REG_SEQUENCE VGPR_32, (VT (IMPLICIT_DEF)), lo16, $elt1, hi16)))
>;
def : GCNPat <
- (build_vector f16:$elt0, (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
- (v2f16 (mixhi_inst $src0_modifiers, $src0,
+ (build_vector VT:$elt0, (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers)))))),
+ (vecVT (mixhi_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
$src2_modifiers, $src2,
DSTCLAMP.NONE,
- (REG_SEQUENCE VGPR_32, $elt0, lo16, (f16 (IMPLICIT_DEF)), hi16)))
+ (REG_SEQUENCE VGPR_32, $elt0, lo16, (VT (IMPLICIT_DEF)), hi16)))
>;
def : GCNPat <
(build_vector
- f16:$elt0,
- (AMDGPUclamp (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))))),
- (v2f16 (mixhi_inst $src0_modifiers, $src0,
+ VT:$elt0,
+ (AMDGPUclamp (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))))),
+ (vecVT (mixhi_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
$src2_modifiers, $src2,
DSTCLAMP.ENABLE,
- (REG_SEQUENCE VGPR_32, $elt0, lo16, (f16 (IMPLICIT_DEF)), hi16)))
+ (REG_SEQUENCE VGPR_32, $elt0, lo16, (VT (IMPLICIT_DEF)), hi16)))
>;
} // end True16Predicate
}
@@ -353,6 +368,24 @@ defm V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3P_Mix_Profile<VOP_F
defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
}
+let SubtargetPredicate = HasFmaMixBF16Insts in {
+let isCommutable = 1 in {
+
+let isReMaterializable = 1 in
+defm V_FMA_MIX_F32_BF16 : VOP3_VOP3PInst<"v_fma_mix_f32_bf16", VOP3P_Mix_Profile<VOP_F32_BF16_BF16_BF16, VOP3_OPSEL>>;
+
+let FPDPRounding = 1 in {
+defm V_FMA_MIXLO_BF16 : VOP3_VOP3PInst<"v_fma_mixlo_bf16", VOP3P_Mix_Profile<VOP_BF16_BF16_BF16_BF16, VOP3_OPSEL, 1>>;
+
+let ClampLo = 0, ClampHi = 1 in {
+defm V_FMA_MIXHI_BF16 : VOP3_VOP3PInst<"v_fma_mixhi_bf16", VOP3P_Mix_Profile<VOP_BF16_BF16_BF16_BF16, VOP3_OPSEL, 1>>;
+}
+} // End FPDPRounding = 1
+} // End isCommutable = 1
+
+defm : MadFmaMixPats<fma, V_FMA_MIX_F32_BF16, V_FMA_MIXLO_BF16, V_FMA_MIXHI_BF16, bf16, v2bf16>;
+} // End SubtargetPredicate = HasFmaMixBF16Insts
+
def PK_ADD_MINMAX_Profile : VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16, VOP3_PACKED> {
let HasModifiers = 0;
}
@@ -1196,6 +1229,20 @@ let isCommutable = 1, isReMaterializable = 1 in {
let SubtargetPredicate = HasPkMovB32, isAsCheapAsAMove = 1 in
defm V_PK_MOV_B32 : VOP3PInst<"v_pk_mov_b32", VOP3P_Profile<VOP_V2I32_V2I32_V2I32, VOP3_PACKED>>;
+
+ let SubtargetPredicate = HasBF16PackedInsts in {
+ defm V_PK_ADD_BF16 : VOP3PInst<"v_pk_add_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, any_fadd>;
+ defm V_PK_MUL_BF16 : VOP3PInst<"v_pk_mul_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, any_fmul>;
+ defm V_PK_MIN_NUM_BF16 : VOP3PInst<"v_pk_min_num_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, fminnum_like>;
+ defm V_PK_MAX_NUM_BF16 : VOP3PInst<"v_pk_max_num_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, fmaxnum_like>;
+ defm V_PK_FMA_BF16 : VOP3PInst<"v_pk_fma_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, any_fma>;
+
+ // Scalar pseudo used to emulate AMDGPUClamp.
+ // Expanded to V_PK_MAX_NUM_BF16 with unused high half.
+ // FIXME-TRUE16: Pseudo expansion of this won't work with True16.
+ let True16Predicate = UseFakeTrue16Insts in
+ defm V_MAX_BF16_PSEUDO : VOP3Inst <"v_max_bf16", VOP_BF16_BF16_BF16>;
+ }
} // End isCommutable = 1, isReMaterializable = 1
def : AMDGPUMnemonicAlias<"v_accvgpr_read", "v_accvgpr_read_b32">;
@@ -2210,6 +2257,10 @@ multiclass VOP3P_Real_with_name_gfx12<bits<8> op,
defm V_PK_MIN_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1b, "V_PK_MIN_F16", "v_pk_min_num_f16">;
defm V_PK_MAX_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1c, "V_PK_MAX_F16", "v_pk_max_num_f16">;
+defm V_PK_FMA_F32 : VOP3P_Real_gfx12<0x1f>;
+defm V_PK_MUL_F32 : VOP3P_Real_gfx12<0x28>;
+defm V_PK_ADD_F32 : VOP3P_Real_gfx12<0x29>;
+
defm V_PK_ADD_MAX_I16 : VOP3P_Real_gfx1250<0x14>;
defm V_PK_ADD_MAX_U16 : VOP3P_Real_gfx1250<0x15>;
defm V_PK_ADD_MIN_I16 : VOP3P_Real_gfx1250<0x2d>;
@@ -2218,6 +2269,22 @@ defm V_PK_MAX3_I16 : VOP3P_Real_gfx1250<0x2f>;
defm V_PK_MAX3_U16 : VOP3P_Real_gfx1250<0x30>;
defm V_PK_MIN3_I16 : VOP3P_Real_gfx1250<0x31>;
defm V_PK_MIN3_U16 : VOP3P_Real_gfx1250<0x32>;
+defm V_PK_FMA_BF16 : VOP3P_Real_gfx1250<0x11>;
+defm V_PK_ADD_BF16 : VOP3P_Real_gfx1250<0x23>;
+defm V_PK_MUL_BF16 : VOP3P_Real_gfx1250<0x2a>;
+defm V_PK_MIN_NUM_BF16 : VOP3P_Real_gfx1250<0x2b>;
+defm V_PK_MAX_NUM_BF16 : VOP3P_Real_gfx1250<0x2c>;
+defm V_PK_MINIMUM3_F16 : VOP3P_Real_gfx1250<0x36>;
+defm V_PK_MAXIMUM3_F16 : VOP3P_Real_gfx1250<0x37>;
+defm V_PK_MIN3_NUM_F16 : VOP3P_Real_gfx1250<0x38>;
+defm V_PK_MAX3_NUM_F16 : VOP3P_Real_gfx1250<0x39>;
+
+defm V_FMA_MIX_F32_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3d>;
+defm V_FMA_MIXLO_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3e>;
+defm V_FMA_MIXHI_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3f>;
+
+let AssemblerPredicate = isGFX1250Plus in
+def : AMDGPUMnemonicAlias<"v_fma_mix_f32_f16", "v_fma_mix_f32">;
defm V_PK_MINIMUM_F16 : VOP3P_Real_gfx12<0x1d>;
defm V_PK_MAXIMUM_F16 : VOP3P_Real_gfx12<0x1e>;
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 8b7f06a..fca5dff 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -20347,6 +20347,13 @@ ARMTargetLowering::getSingleConstraintMatchWeight(
return weight;
}
+static bool isIncompatibleReg(const MCPhysReg &PR, MVT VT) {
+ if (PR == 0 || VT == MVT::Other)
+ return false;
+ return (ARM::SPRRegClass.contains(PR) && VT != MVT::f32 && VT != MVT::i32) ||
+ (ARM::DPRRegClass.contains(PR) && VT != MVT::f64);
+}
+
using RCPair = std::pair<unsigned, const TargetRegisterClass *>;
RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
@@ -20420,7 +20427,10 @@ RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
if (StringRef("{cc}").equals_insensitive(Constraint))
return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
- return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+ auto RCP = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+ if (isIncompatibleReg(RCP.first, VT))
+ return {0, nullptr};
+ return RCP;
}
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index ec6f4e2..ece6c10 100644
--- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -12327,7 +12327,7 @@ bool ARMAsmParser::parseDirectiveEven(SMLoc L) {
}
assert(Section && "must have section to emit alignment");
- if (Section->useCodeAlign())
+ if (getContext().getAsmInfo()->useCodeAlign(*Section))
getStreamer().emitCodeAlignment(Align(2), &getSTI());
else
getStreamer().emitValueToAlignment(Align(2));
@@ -12525,7 +12525,7 @@ bool ARMAsmParser::parseDirectiveAlign(SMLoc L) {
// '.align' is target specifically handled to mean 2**2 byte alignment.
const MCSection *Section = getStreamer().getCurrentSectionOnly();
assert(Section && "must have section to emit alignment");
- if (Section->useCodeAlign())
+ if (getContext().getAsmInfo()->useCodeAlign(*Section))
getStreamer().emitCodeAlignment(Align(4), &getSTI(), 0);
else
getStreamer().emitValueToAlignment(Align(4), 0, 1, 0);
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index a7a9911..868556b 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -708,8 +708,6 @@ private:
void SwitchToExTabSection(const MCSymbol &FnStart);
void SwitchToExIdxSection(const MCSymbol &FnStart);
- void EmitFixup(const MCExpr *Expr, MCFixupKind Kind);
-
bool IsThumb;
bool IsAndroid;
@@ -1096,8 +1094,8 @@ void ARMTargetELFStreamer::emitLabel(MCSymbol *Symbol) {
}
void ARMTargetELFStreamer::annotateTLSDescriptorSequence(
- const MCSymbolRefExpr *S) {
- getStreamer().EmitFixup(S, FK_Data_4);
+ const MCSymbolRefExpr *Expr) {
+ getStreamer().addFixup(Expr, FK_Data_4);
}
void ARMTargetELFStreamer::emitCode16() { getStreamer().setIsThumb(true); }
@@ -1140,7 +1138,8 @@ void ARMTargetELFStreamer::finish() {
MCContext &Ctx = getContext();
auto &Asm = getStreamer().getAssembler();
if (any_of(Asm, [](const MCSection &Sec) {
- return cast<MCSectionELF>(Sec).getFlags() & ELF::SHF_ARM_PURECODE;
+ return static_cast<const MCSectionELF &>(Sec).getFlags() &
+ ELF::SHF_ARM_PURECODE;
})) {
auto *Text =
static_cast<MCSectionELF *>(Ctx.getObjectFileInfo()->getTextSection());
@@ -1206,11 +1205,6 @@ inline void ARMELFStreamer::SwitchToExIdxSection(const MCSymbol &FnStart) {
SectionKind::getData(), FnStart);
}
-void ARMELFStreamer::EmitFixup(const MCExpr *Expr, MCFixupKind Kind) {
- MCFragment *Frag = getCurrentFragment();
- Frag->addFixup(MCFixup::create(Frag->getContents().size(), Expr, Kind));
-}
-
void ARMELFStreamer::EHReset() {
ExTab = nullptr;
FnStart = nullptr;
diff --git a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
index ad8aa571..0fb33cd 100644
--- a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
+++ b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
@@ -260,7 +260,7 @@ bool AVRAsmPrinter::doFinalization(Module &M) {
continue;
}
- auto *Section = cast<MCSectionELF>(TLOF.SectionForGlobal(&GO, TM));
+ auto *Section = static_cast<MCSectionELF *>(TLOF.SectionForGlobal(&GO, TM));
if (Section->getName().starts_with(".data"))
NeedsCopyData = true;
else if (Section->getName().starts_with(".rodata") && SubTM->hasLPM())
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
index c2c1bb4..0615ec9 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
@@ -24,8 +24,6 @@ AVRMCAsmInfo::AVRMCAsmInfo(const Triple &TT, const MCTargetOptions &Options) {
CalleeSaveStackSlotSize = 2;
CommentString = ";";
SeparatorString = "$";
- PrivateGlobalPrefix = ".L";
- PrivateLabelPrefix = ".L";
UsesELFSectionDirectiveForBSS = true;
SupportsDebugInformation = true;
}
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h
index fab2713..1915fa8 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h
@@ -14,7 +14,7 @@
#define LLVM_AVR_ASM_INFO_H
#include "MCTargetDesc/AVRMCExpr.h"
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmInfoELF.h"
#include "llvm/MC/MCExpr.h"
namespace llvm {
@@ -22,7 +22,7 @@ namespace llvm {
class Triple;
/// Specifies the format of AVR assembly files.
-class AVRMCAsmInfo : public MCAsmInfo {
+class AVRMCAsmInfo : public MCAsmInfoELF {
public:
explicit AVRMCAsmInfo(const Triple &TT, const MCTargetOptions &Options);
void printSpecifierExpr(raw_ostream &OS,
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
index 5963976..6ec78d0 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
@@ -7,12 +7,10 @@
//===----------------------------------------------------------------------===//
#include "AVRMCExpr.h"
-#include "MCTargetDesc/AVRMCAsmInfo.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCValue.h"
namespace llvm {
diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp
index 1e29a0f..a87b9a2 100644
--- a/llvm/lib/Target/BPF/BTFDebug.cpp
+++ b/llvm/lib/Target/BPF/BTFDebug.cpp
@@ -1255,10 +1255,8 @@ void BTFDebug::beginFunctionImpl(const MachineFunction *MF) {
FuncInfo.Label = FuncLabel;
FuncInfo.TypeId = FuncTypeId;
if (FuncLabel->isInSection()) {
- MCSection &Section = FuncLabel->getSection();
- const MCSectionELF *SectionELF = dyn_cast<MCSectionELF>(&Section);
- assert(SectionELF && "Null section for Function Label");
- SecNameOff = addString(SectionELF->getName());
+ auto &Sec = static_cast<const MCSectionELF &>(FuncLabel->getSection());
+ SecNameOff = addString(Sec.getName());
} else {
SecNameOff = addString(".text");
}
diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
index 827e928..bb74f6a 100644
--- a/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
+++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
@@ -54,11 +54,8 @@ unsigned BPFELFObjectWriter::getRelocType(const MCFixup &Fixup,
const MCSymbol &Sym = *A;
if (Sym.isDefined()) {
- MCSection &Section = Sym.getSection();
- const MCSectionELF *SectionELF = dyn_cast<MCSectionELF>(&Section);
- assert(SectionELF && "Null section for reloc symbol");
-
- unsigned Flags = SectionELF->getFlags();
+ auto &Section = static_cast<const MCSectionELF &>(Sym.getSection());
+ unsigned Flags = Section.getFlags();
if (Sym.isTemporary()) {
// .BTF.ext generates FK_Data_4 relocations for
diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp
index a0011e8..fa9007e 100644
--- a/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp
+++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp
@@ -16,7 +16,6 @@
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
index 7b21684..63d6e6f 100644
--- a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
+++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
@@ -13,18 +13,19 @@
#ifndef LLVM_LIB_TARGET_BPF_MCTARGETDESC_BPFMCASMINFO_H
#define LLVM_LIB_TARGET_BPF_MCTARGETDESC_BPFMCASMINFO_H
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmInfoELF.h"
#include "llvm/TargetParser/Triple.h"
namespace llvm {
-class BPFMCAsmInfo : public MCAsmInfo {
+class BPFMCAsmInfo : public MCAsmInfoELF {
public:
explicit BPFMCAsmInfo(const Triple &TT, const MCTargetOptions &Options) {
if (TT.getArch() == Triple::bpfeb)
IsLittleEndian = false;
PrivateGlobalPrefix = ".L";
+ PrivateLabelPrefix = "L";
WeakRefDirective = "\t.weak\t";
UsesELFSectionDirectiveForBSS = true;
diff --git a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
index d9d9b36..feecfc0 100644
--- a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
+++ b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
@@ -301,41 +301,53 @@ bool DataScalarizerVisitor::visitExtractElementInst(ExtractElementInst &EEI) {
}
bool DataScalarizerVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
- Value *PtrOperand = GEPI.getPointerOperand();
- Type *OrigGEPType = GEPI.getSourceElementType();
- Type *NewGEPType = OrigGEPType;
+ GEPOperator *GOp = cast<GEPOperator>(&GEPI);
+ Value *PtrOperand = GOp->getPointerOperand();
+ Type *NewGEPType = GOp->getSourceElementType();
bool NeedsTransform = false;
+ // Unwrap GEP ConstantExprs to find the base operand and element type
+ while (auto *CE = dyn_cast<ConstantExpr>(PtrOperand)) {
+ if (auto *GEPCE = dyn_cast<GEPOperator>(CE)) {
+ GOp = GEPCE;
+ PtrOperand = GEPCE->getPointerOperand();
+ NewGEPType = GEPCE->getSourceElementType();
+ } else
+ break;
+ }
+
if (GlobalVariable *NewGlobal = lookupReplacementGlobal(PtrOperand)) {
NewGEPType = NewGlobal->getValueType();
PtrOperand = NewGlobal;
NeedsTransform = true;
} else if (AllocaInst *Alloca = dyn_cast<AllocaInst>(PtrOperand)) {
Type *AllocatedType = Alloca->getAllocatedType();
- // Only transform if the allocated type is an array
- if (AllocatedType != OrigGEPType && isa<ArrayType>(AllocatedType)) {
+ if (isa<ArrayType>(AllocatedType) &&
+ AllocatedType != GOp->getResultElementType()) {
NewGEPType = AllocatedType;
NeedsTransform = true;
}
}
- // Scalar geps should remain scalars geps. The dxil-flatten-arrays pass will
- // convert these scalar geps into flattened array geps
- if (!isa<ArrayType>(OrigGEPType))
- NewGEPType = OrigGEPType;
-
- // Note: We bail if this isn't a gep touched via alloca or global
- // transformations
if (!NeedsTransform)
return false;
- IRBuilder<> Builder(&GEPI);
- SmallVector<Value *, MaxVecSize> Indices(GEPI.indices());
+ // Keep scalar GEPs scalar; dxil-flatten-arrays will do flattening later
+ if (!isa<ArrayType>(GOp->getSourceElementType()))
+ NewGEPType = GOp->getSourceElementType();
+ IRBuilder<> Builder(&GEPI);
+ SmallVector<Value *, MaxVecSize> Indices(GOp->indices());
Value *NewGEP = Builder.CreateGEP(NewGEPType, PtrOperand, Indices,
- GEPI.getName(), GEPI.getNoWrapFlags());
- GEPI.replaceAllUsesWith(NewGEP);
- GEPI.eraseFromParent();
+ GOp->getName(), GOp->getNoWrapFlags());
+
+ GOp->replaceAllUsesWith(NewGEP);
+
+ if (auto *CE = dyn_cast<ConstantExpr>(GOp))
+ CE->destroyConstant();
+ else if (auto *OldGEPI = dyn_cast<GetElementPtrInst>(GOp))
+ OldGEPI->eraseFromParent();
+
return true;
}
diff --git a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp
index f0e2e78..7e1436e 100644
--- a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp
+++ b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp
@@ -263,8 +263,13 @@ bool DXILFlattenArraysVisitor::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// merge the byte offsets. Otherwise, this GEP is itself the root of a GEP
// chain and we need to determine the root array type
if (auto *PtrOpGEP = dyn_cast<GEPOperator>(PtrOperand)) {
- assert(GEPChainInfoMap.contains(PtrOpGEP) &&
- "Expected parent GEP to be visited before this GEP");
+
+ // If the parent GEP was not processed, then we do not want to process its
+ // descendants. This can happen if the GEP chain is for an unsupported type
+  // such as a struct -- we do not flatten structs or GEP chains for structs
+ if (!GEPChainInfoMap.contains(PtrOpGEP))
+ return false;
+
GEPInfo &PGEPInfo = GEPChainInfoMap[PtrOpGEP];
Info.RootFlattenedArrayType = PGEPInfo.RootFlattenedArrayType;
Info.RootPointerOperand = PGEPInfo.RootPointerOperand;
diff --git a/llvm/lib/Target/DirectX/DXILLegalizePass.cpp b/llvm/lib/Target/DirectX/DXILLegalizePass.cpp
index c73648f..3427968 100644
--- a/llvm/lib/Target/DirectX/DXILLegalizePass.cpp
+++ b/llvm/lib/Target/DirectX/DXILLegalizePass.cpp
@@ -24,18 +24,19 @@
using namespace llvm;
-static void legalizeFreeze(Instruction &I,
+static bool legalizeFreeze(Instruction &I,
SmallVectorImpl<Instruction *> &ToRemove,
DenseMap<Value *, Value *>) {
auto *FI = dyn_cast<FreezeInst>(&I);
if (!FI)
- return;
+ return false;
FI->replaceAllUsesWith(FI->getOperand(0));
ToRemove.push_back(FI);
+ return true;
}
-static void fixI8UseChain(Instruction &I,
+static bool fixI8UseChain(Instruction &I,
SmallVectorImpl<Instruction *> &ToRemove,
DenseMap<Value *, Value *> &ReplacedValues) {
@@ -74,19 +75,19 @@ static void fixI8UseChain(Instruction &I,
if (Trunc->getDestTy()->isIntegerTy(8)) {
ReplacedValues[Trunc] = Trunc->getOperand(0);
ToRemove.push_back(Trunc);
- return;
+ return true;
}
}
if (auto *Store = dyn_cast<StoreInst>(&I)) {
if (!Store->getValueOperand()->getType()->isIntegerTy(8))
- return;
+ return false;
SmallVector<Value *> NewOperands;
ProcessOperands(NewOperands);
Value *NewStore = Builder.CreateStore(NewOperands[0], NewOperands[1]);
ReplacedValues[Store] = NewStore;
ToRemove.push_back(Store);
- return;
+ return true;
}
if (auto *Load = dyn_cast<LoadInst>(&I);
@@ -104,17 +105,17 @@ static void fixI8UseChain(Instruction &I,
LoadInst *NewLoad = Builder.CreateLoad(ElementType, NewOperands[0]);
ReplacedValues[Load] = NewLoad;
ToRemove.push_back(Load);
- return;
+ return true;
}
if (auto *Load = dyn_cast<LoadInst>(&I);
Load && isa<ConstantExpr>(Load->getPointerOperand())) {
auto *CE = dyn_cast<ConstantExpr>(Load->getPointerOperand());
if (!(CE->getOpcode() == Instruction::GetElementPtr))
- return;
+ return false;
auto *GEP = dyn_cast<GEPOperator>(CE);
if (!GEP->getSourceElementType()->isIntegerTy(8))
- return;
+ return false;
Type *ElementType = Load->getType();
ConstantInt *Offset = dyn_cast<ConstantInt>(GEP->getOperand(1));
@@ -143,12 +144,12 @@ static void fixI8UseChain(Instruction &I,
ReplacedValues[Load] = NewLoad;
Load->replaceAllUsesWith(NewLoad);
ToRemove.push_back(Load);
- return;
+ return true;
}
if (auto *BO = dyn_cast<BinaryOperator>(&I)) {
if (!I.getType()->isIntegerTy(8))
- return;
+ return false;
SmallVector<Value *> NewOperands;
ProcessOperands(NewOperands);
Value *NewInst =
@@ -162,24 +163,24 @@ static void fixI8UseChain(Instruction &I,
}
ReplacedValues[BO] = NewInst;
ToRemove.push_back(BO);
- return;
+ return true;
}
if (auto *Sel = dyn_cast<SelectInst>(&I)) {
if (!I.getType()->isIntegerTy(8))
- return;
+ return false;
SmallVector<Value *> NewOperands;
ProcessOperands(NewOperands);
Value *NewInst = Builder.CreateSelect(Sel->getCondition(), NewOperands[1],
NewOperands[2]);
ReplacedValues[Sel] = NewInst;
ToRemove.push_back(Sel);
- return;
+ return true;
}
if (auto *Cmp = dyn_cast<CmpInst>(&I)) {
if (!Cmp->getOperand(0)->getType()->isIntegerTy(8))
- return;
+ return false;
SmallVector<Value *> NewOperands;
ProcessOperands(NewOperands);
Value *NewInst =
@@ -187,18 +188,18 @@ static void fixI8UseChain(Instruction &I,
Cmp->replaceAllUsesWith(NewInst);
ReplacedValues[Cmp] = NewInst;
ToRemove.push_back(Cmp);
- return;
+ return true;
}
if (auto *Cast = dyn_cast<CastInst>(&I)) {
if (!Cast->getSrcTy()->isIntegerTy(8))
- return;
+ return false;
ToRemove.push_back(Cast);
auto *Replacement = ReplacedValues[Cast->getOperand(0)];
if (Cast->getType() == Replacement->getType()) {
Cast->replaceAllUsesWith(Replacement);
- return;
+ return true;
}
Value *AdjustedCast = nullptr;
@@ -213,7 +214,7 @@ static void fixI8UseChain(Instruction &I,
if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
if (!GEP->getType()->isPointerTy() ||
!GEP->getSourceElementType()->isIntegerTy(8))
- return;
+ return false;
Value *BasePtr = GEP->getPointerOperand();
if (ReplacedValues.count(BasePtr))
@@ -248,15 +249,17 @@ static void fixI8UseChain(Instruction &I,
ReplacedValues[GEP] = NewGEP;
GEP->replaceAllUsesWith(NewGEP);
ToRemove.push_back(GEP);
+ return true;
}
+ return false;
}
-static void upcastI8AllocasAndUses(Instruction &I,
+static bool upcastI8AllocasAndUses(Instruction &I,
SmallVectorImpl<Instruction *> &ToRemove,
DenseMap<Value *, Value *> &ReplacedValues) {
auto *AI = dyn_cast<AllocaInst>(&I);
if (!AI || !AI->getAllocatedType()->isIntegerTy(8))
- return;
+ return false;
Type *SmallestType = nullptr;
@@ -291,16 +294,17 @@ static void upcastI8AllocasAndUses(Instruction &I,
}
if (!SmallestType)
- return; // no valid casts found
+ return false; // no valid casts found
// Replace alloca
IRBuilder<> Builder(AI);
auto *NewAlloca = Builder.CreateAlloca(SmallestType);
ReplacedValues[AI] = NewAlloca;
ToRemove.push_back(AI);
+ return true;
}
-static void
+static bool
downcastI64toI32InsertExtractElements(Instruction &I,
SmallVectorImpl<Instruction *> &ToRemove,
DenseMap<Value *, Value *> &) {
@@ -318,6 +322,7 @@ downcastI64toI32InsertExtractElements(Instruction &I,
Extract->replaceAllUsesWith(NewExtract);
ToRemove.push_back(Extract);
+ return true;
}
}
@@ -335,8 +340,10 @@ downcastI64toI32InsertExtractElements(Instruction &I,
Insert->replaceAllUsesWith(Insert32Index);
ToRemove.push_back(Insert);
+ return true;
}
}
+ return false;
}
static void emitMemcpyExpansion(IRBuilder<> &Builder, Value *Dst, Value *Src,
@@ -453,17 +460,17 @@ static void emitMemsetExpansion(IRBuilder<> &Builder, Value *Dst, Value *Val,
// Expands the instruction `I` into corresponding loads and stores if it is a
// memcpy call. In that case, the call instruction is added to the `ToRemove`
// vector. `ReplacedValues` is unused.
-static void legalizeMemCpy(Instruction &I,
+static bool legalizeMemCpy(Instruction &I,
SmallVectorImpl<Instruction *> &ToRemove,
DenseMap<Value *, Value *> &ReplacedValues) {
CallInst *CI = dyn_cast<CallInst>(&I);
if (!CI)
- return;
+ return false;
Intrinsic::ID ID = CI->getIntrinsicID();
if (ID != Intrinsic::memcpy)
- return;
+ return false;
IRBuilder<> Builder(&I);
Value *Dst = CI->getArgOperand(0);
@@ -476,19 +483,20 @@ static void legalizeMemCpy(Instruction &I,
assert(IsVolatile->getZExtValue() == 0 && "Expected IsVolatile to be false");
emitMemcpyExpansion(Builder, Dst, Src, Length);
ToRemove.push_back(CI);
+ return true;
}
-static void legalizeMemSet(Instruction &I,
+static bool legalizeMemSet(Instruction &I,
SmallVectorImpl<Instruction *> &ToRemove,
DenseMap<Value *, Value *> &ReplacedValues) {
CallInst *CI = dyn_cast<CallInst>(&I);
if (!CI)
- return;
+ return false;
Intrinsic::ID ID = CI->getIntrinsicID();
if (ID != Intrinsic::memset)
- return;
+ return false;
IRBuilder<> Builder(&I);
Value *Dst = CI->getArgOperand(0);
@@ -497,23 +505,25 @@ static void legalizeMemSet(Instruction &I,
assert(Size && "Expected Size to be a ConstantInt");
emitMemsetExpansion(Builder, Dst, Val, Size, ReplacedValues);
ToRemove.push_back(CI);
+ return true;
}
-static void updateFnegToFsub(Instruction &I,
+static bool updateFnegToFsub(Instruction &I,
SmallVectorImpl<Instruction *> &ToRemove,
DenseMap<Value *, Value *> &) {
const Intrinsic::ID ID = I.getOpcode();
if (ID != Instruction::FNeg)
- return;
+ return false;
IRBuilder<> Builder(&I);
Value *In = I.getOperand(0);
Value *Zero = ConstantFP::get(In->getType(), -0.0);
I.replaceAllUsesWith(Builder.CreateFSub(Zero, In));
ToRemove.push_back(&I);
+ return true;
}
-static void
+static bool
legalizeGetHighLowi64Bytes(Instruction &I,
SmallVectorImpl<Instruction *> &ToRemove,
DenseMap<Value *, Value *> &ReplacedValues) {
@@ -523,13 +533,13 @@ legalizeGetHighLowi64Bytes(Instruction &I,
BitCast->getSrcTy()->isIntegerTy(64)) {
ToRemove.push_back(BitCast);
ReplacedValues[BitCast] = BitCast->getOperand(0);
- return;
+ return true;
}
}
if (auto *Extract = dyn_cast<ExtractElementInst>(&I)) {
if (!dyn_cast<BitCastInst>(Extract->getVectorOperand()))
- return;
+ return false;
auto *VecTy = dyn_cast<FixedVectorType>(Extract->getVectorOperandType());
if (VecTy && VecTy->getElementType()->isIntegerTy(32) &&
VecTy->getNumElements() == 2) {
@@ -557,12 +567,14 @@ legalizeGetHighLowi64Bytes(Instruction &I,
}
ToRemove.push_back(Extract);
Extract->replaceAllUsesWith(ReplacedValues[Extract]);
+ return true;
}
}
}
+ return false;
}
-static void
+static bool
legalizeScalarLoadStoreOnArrays(Instruction &I,
SmallVectorImpl<Instruction *> &ToRemove,
DenseMap<Value *, Value *> &) {
@@ -579,14 +591,14 @@ legalizeScalarLoadStoreOnArrays(Instruction &I,
PtrOpIndex = SI->getPointerOperandIndex();
LoadStoreTy = SI->getValueOperand()->getType();
} else
- return;
+ return false;
// If the load/store is not of a single-value type (i.e., scalar or vector)
// then we do not modify it. It shouldn't be a vector either because the
// dxil-data-scalarization pass is expected to run before this, but it's not
// incorrect to apply this transformation to vector load/stores.
if (!LoadStoreTy->isSingleValueType())
- return;
+ return false;
Type *ArrayTy;
if (auto *GlobalVarPtrOp = dyn_cast<GlobalVariable>(PtrOp))
@@ -594,10 +606,10 @@ legalizeScalarLoadStoreOnArrays(Instruction &I,
else if (auto *AllocaPtrOp = dyn_cast<AllocaInst>(PtrOp))
ArrayTy = AllocaPtrOp->getAllocatedType();
else
- return;
+ return false;
if (!isa<ArrayType>(ArrayTy))
- return;
+ return false;
assert(ArrayTy->getArrayElementType() == LoadStoreTy &&
"Expected array element type to be the same as to the scalar load or "
@@ -607,6 +619,7 @@ legalizeScalarLoadStoreOnArrays(Instruction &I,
Value *GEP = GetElementPtrInst::Create(
ArrayTy, PtrOp, {Zero, Zero}, GEPNoWrapFlags::all(), "", I.getIterator());
I.setOperand(PtrOpIndex, GEP);
+ return true;
}
namespace {
@@ -624,13 +637,11 @@ public:
ReplacedValues.clear();
for (auto &I : instructions(F)) {
for (auto &LegalizationFn : LegalizationPipeline[Stage])
- LegalizationFn(I, ToRemove, ReplacedValues);
+ MadeChange |= LegalizationFn(I, ToRemove, ReplacedValues);
}
for (auto *Inst : reverse(ToRemove))
Inst->eraseFromParent();
-
- MadeChange |= !ToRemove.empty();
}
return MadeChange;
}
@@ -639,7 +650,7 @@ private:
enum LegalizationStage { Stage1 = 0, Stage2 = 1, NumStages };
using LegalizationFnTy =
- std::function<void(Instruction &, SmallVectorImpl<Instruction *> &,
+ std::function<bool(Instruction &, SmallVectorImpl<Instruction *> &,
DenseMap<Value *, Value *> &)>;
SmallVector<LegalizationFnTy> LegalizationPipeline[NumStages];
diff --git a/llvm/lib/Target/DirectX/DXILResourceAccess.cpp b/llvm/lib/Target/DirectX/DXILResourceAccess.cpp
index 566f3a9..c33ec0e 100644
--- a/llvm/lib/Target/DirectX/DXILResourceAccess.cpp
+++ b/llvm/lib/Target/DirectX/DXILResourceAccess.cpp
@@ -241,7 +241,6 @@ static void replaceAccess(IntrinsicInst *II, dxil::ResourceTypeInfo &RTI) {
}
static bool transformResourcePointers(Function &F, DXILResourceTypeMap &DRTM) {
- bool Changed = false;
SmallVector<std::pair<IntrinsicInst *, dxil::ResourceTypeInfo>> Resources;
for (BasicBlock &BB : F)
for (Instruction &I : BB)
@@ -254,7 +253,7 @@ static bool transformResourcePointers(Function &F, DXILResourceTypeMap &DRTM) {
for (auto &[II, RI] : Resources)
replaceAccess(II, RI);
- return Changed;
+ return !Resources.empty();
}
PreservedAnalyses DXILResourceAccess::run(Function &F,
diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp
index dfc8162..ebdfcaa 100644
--- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp
+++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp
@@ -16,6 +16,7 @@
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/DXILMetadataAnalysis.h"
#include "llvm/BinaryFormat/DXContainer.h"
+#include "llvm/Frontend/HLSL/RootSignatureMetadata.h"
#include "llvm/Frontend/HLSL/RootSignatureValidations.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DiagnosticInfo.h"
@@ -29,25 +30,10 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>
-#include <optional>
-#include <utility>
using namespace llvm;
using namespace llvm::dxil;
-static bool reportError(LLVMContext *Ctx, Twine Message,
- DiagnosticSeverity Severity = DS_Error) {
- Ctx->diagnose(DiagnosticInfoGeneric(Message, Severity));
- return true;
-}
-
-static bool reportValueError(LLVMContext *Ctx, Twine ParamName,
- uint32_t Value) {
- Ctx->diagnose(DiagnosticInfoGeneric(
- "Invalid value for " + ParamName + ": " + Twine(Value), DS_Error));
- return true;
-}
-
static std::optional<uint32_t> extractMdIntValue(MDNode *Node,
unsigned int OpId) {
if (auto *CI =
@@ -56,453 +42,10 @@ static std::optional<uint32_t> extractMdIntValue(MDNode *Node,
return std::nullopt;
}
-static std::optional<float> extractMdFloatValue(MDNode *Node,
- unsigned int OpId) {
- if (auto *CI = mdconst::dyn_extract<ConstantFP>(Node->getOperand(OpId).get()))
- return CI->getValueAPF().convertToFloat();
- return std::nullopt;
-}
-
-static std::optional<StringRef> extractMdStringValue(MDNode *Node,
- unsigned int OpId) {
- MDString *NodeText = dyn_cast<MDString>(Node->getOperand(OpId));
- if (NodeText == nullptr)
- return std::nullopt;
- return NodeText->getString();
-}
-
-static bool parseRootFlags(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD,
- MDNode *RootFlagNode) {
-
- if (RootFlagNode->getNumOperands() != 2)
- return reportError(Ctx, "Invalid format for RootFlag Element");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RootFlagNode, 1))
- RSD.Flags = *Val;
- else
- return reportError(Ctx, "Invalid value for RootFlag");
-
- return false;
-}
-
-static bool parseRootConstants(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD,
- MDNode *RootConstantNode) {
-
- if (RootConstantNode->getNumOperands() != 5)
- return reportError(Ctx, "Invalid format for RootConstants Element");
-
- dxbc::RTS0::v1::RootParameterHeader Header;
- // The parameter offset doesn't matter here - we recalculate it during
- // serialization Header.ParameterOffset = 0;
- Header.ParameterType =
- llvm::to_underlying(dxbc::RootParameterType::Constants32Bit);
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 1))
- Header.ShaderVisibility = *Val;
- else
- return reportError(Ctx, "Invalid value for ShaderVisibility");
-
- dxbc::RTS0::v1::RootConstants Constants;
- if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 2))
- Constants.ShaderRegister = *Val;
- else
- return reportError(Ctx, "Invalid value for ShaderRegister");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 3))
- Constants.RegisterSpace = *Val;
- else
- return reportError(Ctx, "Invalid value for RegisterSpace");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 4))
- Constants.Num32BitValues = *Val;
- else
- return reportError(Ctx, "Invalid value for Num32BitValues");
-
- RSD.ParametersContainer.addParameter(Header, Constants);
-
- return false;
-}
-
-static bool parseRootDescriptors(LLVMContext *Ctx,
- mcdxbc::RootSignatureDesc &RSD,
- MDNode *RootDescriptorNode,
- RootSignatureElementKind ElementKind) {
- assert(ElementKind == RootSignatureElementKind::SRV ||
- ElementKind == RootSignatureElementKind::UAV ||
- ElementKind == RootSignatureElementKind::CBV &&
- "parseRootDescriptors should only be called with RootDescriptor "
- "element kind.");
- if (RootDescriptorNode->getNumOperands() != 5)
- return reportError(Ctx, "Invalid format for Root Descriptor Element");
-
- dxbc::RTS0::v1::RootParameterHeader Header;
- switch (ElementKind) {
- case RootSignatureElementKind::SRV:
- Header.ParameterType = llvm::to_underlying(dxbc::RootParameterType::SRV);
- break;
- case RootSignatureElementKind::UAV:
- Header.ParameterType = llvm::to_underlying(dxbc::RootParameterType::UAV);
- break;
- case RootSignatureElementKind::CBV:
- Header.ParameterType = llvm::to_underlying(dxbc::RootParameterType::CBV);
- break;
- default:
- llvm_unreachable("invalid Root Descriptor kind");
- break;
- }
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 1))
- Header.ShaderVisibility = *Val;
- else
- return reportError(Ctx, "Invalid value for ShaderVisibility");
-
- dxbc::RTS0::v2::RootDescriptor Descriptor;
- if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 2))
- Descriptor.ShaderRegister = *Val;
- else
- return reportError(Ctx, "Invalid value for ShaderRegister");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 3))
- Descriptor.RegisterSpace = *Val;
- else
- return reportError(Ctx, "Invalid value for RegisterSpace");
-
- if (RSD.Version == 1) {
- RSD.ParametersContainer.addParameter(Header, Descriptor);
- return false;
- }
- assert(RSD.Version > 1);
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 4))
- Descriptor.Flags = *Val;
- else
- return reportError(Ctx, "Invalid value for Root Descriptor Flags");
-
- RSD.ParametersContainer.addParameter(Header, Descriptor);
- return false;
-}
-
-static bool parseDescriptorRange(LLVMContext *Ctx,
- mcdxbc::DescriptorTable &Table,
- MDNode *RangeDescriptorNode) {
-
- if (RangeDescriptorNode->getNumOperands() != 6)
- return reportError(Ctx, "Invalid format for Descriptor Range");
-
- dxbc::RTS0::v2::DescriptorRange Range;
-
- std::optional<StringRef> ElementText =
- extractMdStringValue(RangeDescriptorNode, 0);
-
- if (!ElementText.has_value())
- return reportError(Ctx, "Descriptor Range, first element is not a string.");
-
- Range.RangeType =
- StringSwitch<uint32_t>(*ElementText)
- .Case("CBV", llvm::to_underlying(dxbc::DescriptorRangeType::CBV))
- .Case("SRV", llvm::to_underlying(dxbc::DescriptorRangeType::SRV))
- .Case("UAV", llvm::to_underlying(dxbc::DescriptorRangeType::UAV))
- .Case("Sampler",
- llvm::to_underlying(dxbc::DescriptorRangeType::Sampler))
- .Default(~0U);
-
- if (Range.RangeType == ~0U)
- return reportError(Ctx, "Invalid Descriptor Range type: " + *ElementText);
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 1))
- Range.NumDescriptors = *Val;
- else
- return reportError(Ctx, "Invalid value for Number of Descriptor in Range");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 2))
- Range.BaseShaderRegister = *Val;
- else
- return reportError(Ctx, "Invalid value for BaseShaderRegister");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 3))
- Range.RegisterSpace = *Val;
- else
- return reportError(Ctx, "Invalid value for RegisterSpace");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 4))
- Range.OffsetInDescriptorsFromTableStart = *Val;
- else
- return reportError(Ctx,
- "Invalid value for OffsetInDescriptorsFromTableStart");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 5))
- Range.Flags = *Val;
- else
- return reportError(Ctx, "Invalid value for Descriptor Range Flags");
-
- Table.Ranges.push_back(Range);
- return false;
-}
-
-static bool parseDescriptorTable(LLVMContext *Ctx,
- mcdxbc::RootSignatureDesc &RSD,
- MDNode *DescriptorTableNode) {
- const unsigned int NumOperands = DescriptorTableNode->getNumOperands();
- if (NumOperands < 2)
- return reportError(Ctx, "Invalid format for Descriptor Table");
-
- dxbc::RTS0::v1::RootParameterHeader Header;
- if (std::optional<uint32_t> Val = extractMdIntValue(DescriptorTableNode, 1))
- Header.ShaderVisibility = *Val;
- else
- return reportError(Ctx, "Invalid value for ShaderVisibility");
-
- mcdxbc::DescriptorTable Table;
- Header.ParameterType =
- llvm::to_underlying(dxbc::RootParameterType::DescriptorTable);
-
- for (unsigned int I = 2; I < NumOperands; I++) {
- MDNode *Element = dyn_cast<MDNode>(DescriptorTableNode->getOperand(I));
- if (Element == nullptr)
- return reportError(Ctx, "Missing Root Element Metadata Node.");
-
- if (parseDescriptorRange(Ctx, Table, Element))
- return true;
- }
-
- RSD.ParametersContainer.addParameter(Header, Table);
- return false;
-}
-
-static bool parseStaticSampler(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD,
- MDNode *StaticSamplerNode) {
- if (StaticSamplerNode->getNumOperands() != 14)
- return reportError(Ctx, "Invalid format for Static Sampler");
-
- dxbc::RTS0::v1::StaticSampler Sampler;
- if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 1))
- Sampler.Filter = *Val;
- else
- return reportError(Ctx, "Invalid value for Filter");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 2))
- Sampler.AddressU = *Val;
- else
- return reportError(Ctx, "Invalid value for AddressU");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 3))
- Sampler.AddressV = *Val;
- else
- return reportError(Ctx, "Invalid value for AddressV");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 4))
- Sampler.AddressW = *Val;
- else
- return reportError(Ctx, "Invalid value for AddressW");
-
- if (std::optional<float> Val = extractMdFloatValue(StaticSamplerNode, 5))
- Sampler.MipLODBias = *Val;
- else
- return reportError(Ctx, "Invalid value for MipLODBias");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 6))
- Sampler.MaxAnisotropy = *Val;
- else
- return reportError(Ctx, "Invalid value for MaxAnisotropy");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 7))
- Sampler.ComparisonFunc = *Val;
- else
- return reportError(Ctx, "Invalid value for ComparisonFunc ");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 8))
- Sampler.BorderColor = *Val;
- else
- return reportError(Ctx, "Invalid value for ComparisonFunc ");
-
- if (std::optional<float> Val = extractMdFloatValue(StaticSamplerNode, 9))
- Sampler.MinLOD = *Val;
- else
- return reportError(Ctx, "Invalid value for MinLOD");
-
- if (std::optional<float> Val = extractMdFloatValue(StaticSamplerNode, 10))
- Sampler.MaxLOD = *Val;
- else
- return reportError(Ctx, "Invalid value for MaxLOD");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 11))
- Sampler.ShaderRegister = *Val;
- else
- return reportError(Ctx, "Invalid value for ShaderRegister");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 12))
- Sampler.RegisterSpace = *Val;
- else
- return reportError(Ctx, "Invalid value for RegisterSpace");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 13))
- Sampler.ShaderVisibility = *Val;
- else
- return reportError(Ctx, "Invalid value for ShaderVisibility");
-
- RSD.StaticSamplers.push_back(Sampler);
- return false;
-}
-
-static bool parseRootSignatureElement(LLVMContext *Ctx,
- mcdxbc::RootSignatureDesc &RSD,
- MDNode *Element) {
- std::optional<StringRef> ElementText = extractMdStringValue(Element, 0);
- if (!ElementText.has_value())
- return reportError(Ctx, "Invalid format for Root Element");
-
- RootSignatureElementKind ElementKind =
- StringSwitch<RootSignatureElementKind>(*ElementText)
- .Case("RootFlags", RootSignatureElementKind::RootFlags)
- .Case("RootConstants", RootSignatureElementKind::RootConstants)
- .Case("RootCBV", RootSignatureElementKind::CBV)
- .Case("RootSRV", RootSignatureElementKind::SRV)
- .Case("RootUAV", RootSignatureElementKind::UAV)
- .Case("DescriptorTable", RootSignatureElementKind::DescriptorTable)
- .Case("StaticSampler", RootSignatureElementKind::StaticSamplers)
- .Default(RootSignatureElementKind::Error);
-
- switch (ElementKind) {
-
- case RootSignatureElementKind::RootFlags:
- return parseRootFlags(Ctx, RSD, Element);
- case RootSignatureElementKind::RootConstants:
- return parseRootConstants(Ctx, RSD, Element);
- case RootSignatureElementKind::CBV:
- case RootSignatureElementKind::SRV:
- case RootSignatureElementKind::UAV:
- return parseRootDescriptors(Ctx, RSD, Element, ElementKind);
- case RootSignatureElementKind::DescriptorTable:
- return parseDescriptorTable(Ctx, RSD, Element);
- case RootSignatureElementKind::StaticSamplers:
- return parseStaticSampler(Ctx, RSD, Element);
- case RootSignatureElementKind::Error:
- return reportError(Ctx, "Invalid Root Signature Element: " + *ElementText);
- }
-
- llvm_unreachable("Unhandled RootSignatureElementKind enum.");
-}
-
-static bool parse(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD,
- MDNode *Node) {
- bool HasError = false;
-
- // Loop through the Root Elements of the root signature.
- for (const auto &Operand : Node->operands()) {
- MDNode *Element = dyn_cast<MDNode>(Operand);
- if (Element == nullptr)
- return reportError(Ctx, "Missing Root Element Metadata Node.");
-
- HasError = HasError || parseRootSignatureElement(Ctx, RSD, Element);
- }
-
- return HasError;
-}
-
-static bool validate(LLVMContext *Ctx, const mcdxbc::RootSignatureDesc &RSD) {
-
- if (!llvm::hlsl::rootsig::verifyVersion(RSD.Version)) {
- return reportValueError(Ctx, "Version", RSD.Version);
- }
-
- if (!llvm::hlsl::rootsig::verifyRootFlag(RSD.Flags)) {
- return reportValueError(Ctx, "RootFlags", RSD.Flags);
- }
-
- for (const mcdxbc::RootParameterInfo &Info : RSD.ParametersContainer) {
- if (!dxbc::isValidShaderVisibility(Info.Header.ShaderVisibility))
- return reportValueError(Ctx, "ShaderVisibility",
- Info.Header.ShaderVisibility);
-
- assert(dxbc::isValidParameterType(Info.Header.ParameterType) &&
- "Invalid value for ParameterType");
-
- switch (Info.Header.ParameterType) {
-
- case llvm::to_underlying(dxbc::RootParameterType::CBV):
- case llvm::to_underlying(dxbc::RootParameterType::UAV):
- case llvm::to_underlying(dxbc::RootParameterType::SRV): {
- const dxbc::RTS0::v2::RootDescriptor &Descriptor =
- RSD.ParametersContainer.getRootDescriptor(Info.Location);
- if (!llvm::hlsl::rootsig::verifyRegisterValue(Descriptor.ShaderRegister))
- return reportValueError(Ctx, "ShaderRegister",
- Descriptor.ShaderRegister);
-
- if (!llvm::hlsl::rootsig::verifyRegisterSpace(Descriptor.RegisterSpace))
- return reportValueError(Ctx, "RegisterSpace", Descriptor.RegisterSpace);
-
- if (RSD.Version > 1) {
- if (!llvm::hlsl::rootsig::verifyRootDescriptorFlag(RSD.Version,
- Descriptor.Flags))
- return reportValueError(Ctx, "RootDescriptorFlag", Descriptor.Flags);
- }
- break;
- }
- case llvm::to_underlying(dxbc::RootParameterType::DescriptorTable): {
- const mcdxbc::DescriptorTable &Table =
- RSD.ParametersContainer.getDescriptorTable(Info.Location);
- for (const dxbc::RTS0::v2::DescriptorRange &Range : Table) {
- if (!llvm::hlsl::rootsig::verifyRangeType(Range.RangeType))
- return reportValueError(Ctx, "RangeType", Range.RangeType);
-
- if (!llvm::hlsl::rootsig::verifyRegisterSpace(Range.RegisterSpace))
- return reportValueError(Ctx, "RegisterSpace", Range.RegisterSpace);
-
- if (!llvm::hlsl::rootsig::verifyNumDescriptors(Range.NumDescriptors))
- return reportValueError(Ctx, "NumDescriptors", Range.NumDescriptors);
-
- if (!llvm::hlsl::rootsig::verifyDescriptorRangeFlag(
- RSD.Version, Range.RangeType, Range.Flags))
- return reportValueError(Ctx, "DescriptorFlag", Range.Flags);
- }
- break;
- }
- }
- }
-
- for (const dxbc::RTS0::v1::StaticSampler &Sampler : RSD.StaticSamplers) {
- if (!llvm::hlsl::rootsig::verifySamplerFilter(Sampler.Filter))
- return reportValueError(Ctx, "Filter", Sampler.Filter);
-
- if (!llvm::hlsl::rootsig::verifyAddress(Sampler.AddressU))
- return reportValueError(Ctx, "AddressU", Sampler.AddressU);
-
- if (!llvm::hlsl::rootsig::verifyAddress(Sampler.AddressV))
- return reportValueError(Ctx, "AddressV", Sampler.AddressV);
-
- if (!llvm::hlsl::rootsig::verifyAddress(Sampler.AddressW))
- return reportValueError(Ctx, "AddressW", Sampler.AddressW);
-
- if (!llvm::hlsl::rootsig::verifyMipLODBias(Sampler.MipLODBias))
- return reportValueError(Ctx, "MipLODBias", Sampler.MipLODBias);
-
- if (!llvm::hlsl::rootsig::verifyMaxAnisotropy(Sampler.MaxAnisotropy))
- return reportValueError(Ctx, "MaxAnisotropy", Sampler.MaxAnisotropy);
-
- if (!llvm::hlsl::rootsig::verifyComparisonFunc(Sampler.ComparisonFunc))
- return reportValueError(Ctx, "ComparisonFunc", Sampler.ComparisonFunc);
-
- if (!llvm::hlsl::rootsig::verifyBorderColor(Sampler.BorderColor))
- return reportValueError(Ctx, "BorderColor", Sampler.BorderColor);
-
- if (!llvm::hlsl::rootsig::verifyLOD(Sampler.MinLOD))
- return reportValueError(Ctx, "MinLOD", Sampler.MinLOD);
-
- if (!llvm::hlsl::rootsig::verifyLOD(Sampler.MaxLOD))
- return reportValueError(Ctx, "MaxLOD", Sampler.MaxLOD);
-
- if (!llvm::hlsl::rootsig::verifyRegisterValue(Sampler.ShaderRegister))
- return reportValueError(Ctx, "ShaderRegister", Sampler.ShaderRegister);
-
- if (!llvm::hlsl::rootsig::verifyRegisterSpace(Sampler.RegisterSpace))
- return reportValueError(Ctx, "RegisterSpace", Sampler.RegisterSpace);
-
- if (!dxbc::isValidShaderVisibility(Sampler.ShaderVisibility))
- return reportValueError(Ctx, "ShaderVisibility",
- Sampler.ShaderVisibility);
- }
-
- return false;
+static bool reportError(LLVMContext *Ctx, Twine Message,
+ DiagnosticSeverity Severity = DS_Error) {
+ Ctx->diagnose(DiagnosticInfoGeneric(Message, Severity));
+ return true;
}
static SmallDenseMap<const Function *, mcdxbc::RootSignatureDesc>
@@ -584,7 +127,9 @@ analyzeModule(Module &M) {
  // static sampler offset is calculated when writing dxcontainer.
RSD.StaticSamplersOffset = 0u;
- if (parse(Ctx, RSD, RootElementListNode) || validate(Ctx, RSD)) {
+ hlsl::rootsig::MetadataParser MDParser(RootElementListNode);
+
+ if (MDParser.ParseRootSignature(Ctx, RSD)) {
return RSDMap;
}
diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.h b/llvm/lib/Target/DirectX/DXILRootSignature.h
index fc39b38..254b7ff 100644
--- a/llvm/lib/Target/DirectX/DXILRootSignature.h
+++ b/llvm/lib/Target/DirectX/DXILRootSignature.h
@@ -26,17 +26,6 @@
namespace llvm {
namespace dxil {
-enum class RootSignatureElementKind {
- Error = 0,
- RootFlags = 1,
- RootConstants = 2,
- SRV = 3,
- UAV = 4,
- CBV = 5,
- DescriptorTable = 6,
- StaticSamplers = 7
-};
-
class RootSignatureBindingInfo {
private:
SmallDenseMap<const Function *, mcdxbc::RootSignatureDesc> FuncToRsMap;
diff --git a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp
index eb4adfe..e7e7f2c 100644
--- a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp
+++ b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp
@@ -106,11 +106,11 @@ void ModuleShaderFlags::updateFunctionFlags(ComputedShaderFlags &CSF,
DXILResourceTypeMap &DRTM,
const ModuleMetadataInfo &MMDI) {
if (!CSF.Doubles)
- CSF.Doubles = I.getType()->isDoubleTy();
+ CSF.Doubles = I.getType()->getScalarType()->isDoubleTy();
if (!CSF.Doubles) {
for (const Value *Op : I.operands()) {
- if (Op->getType()->isDoubleTy()) {
+ if (Op->getType()->getScalarType()->isDoubleTy()) {
CSF.Doubles = true;
break;
}
@@ -130,12 +130,13 @@ void ModuleShaderFlags::updateFunctionFlags(ComputedShaderFlags &CSF,
}
if (!CSF.LowPrecisionPresent)
- CSF.LowPrecisionPresent =
- I.getType()->isIntegerTy(16) || I.getType()->isHalfTy();
+ CSF.LowPrecisionPresent = I.getType()->getScalarType()->isIntegerTy(16) ||
+ I.getType()->getScalarType()->isHalfTy();
if (!CSF.LowPrecisionPresent) {
for (const Value *Op : I.operands()) {
- if (Op->getType()->isIntegerTy(16) || Op->getType()->isHalfTy()) {
+ if (Op->getType()->getScalarType()->isIntegerTy(16) ||
+ Op->getType()->getScalarType()->isHalfTy()) {
CSF.LowPrecisionPresent = true;
break;
}
@@ -150,11 +151,11 @@ void ModuleShaderFlags::updateFunctionFlags(ComputedShaderFlags &CSF,
}
if (!CSF.Int64Ops)
- CSF.Int64Ops = I.getType()->isIntegerTy(64);
+ CSF.Int64Ops = I.getType()->getScalarType()->isIntegerTy(64);
if (!CSF.Int64Ops && !isa<LifetimeIntrinsic>(&I)) {
for (const Value *Op : I.operands()) {
- if (Op->getType()->isIntegerTy(64)) {
+ if (Op->getType()->getScalarType()->isIntegerTy(64)) {
CSF.Int64Ops = true;
break;
}
diff --git a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
index c2eb24b..c34eecd 100644
--- a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
@@ -38,7 +38,6 @@
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsHexagon.h"
#include "llvm/IR/Module.h"
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index e915a3c4..613cfb5 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -2385,19 +2385,9 @@ SDValue LoongArchTargetLowering::lowerBF16_TO_FP(SDValue Op,
return Res;
}
-static bool isConstantOrUndef(const SDValue Op) {
- if (Op->isUndef())
- return true;
- if (isa<ConstantSDNode>(Op))
- return true;
- if (isa<ConstantFPSDNode>(Op))
- return true;
- return false;
-}
-
-static bool isConstantOrUndefBUILD_VECTOR(const BuildVectorSDNode *Op) {
+static bool isConstantBUILD_VECTOR(const BuildVectorSDNode *Op) {
for (unsigned i = 0; i < Op->getNumOperands(); ++i)
- if (isConstantOrUndef(Op->getOperand(i)))
+ if (isIntOrFPConstant(Op->getOperand(i)))
return true;
return false;
}
@@ -2505,20 +2495,23 @@ SDValue LoongArchTargetLowering::lowerBUILD_VECTOR(SDValue Op,
if (DAG.isSplatValue(Op, /*AllowUndefs=*/false))
return Op;
- if (!isConstantOrUndefBUILD_VECTOR(Node)) {
+ if (!isConstantBUILD_VECTOR(Node)) {
// Use INSERT_VECTOR_ELT operations rather than expand to stores.
// The resulting code is the same length as the expansion, but it doesn't
// use memory operations.
- EVT ResTy = Node->getValueType(0);
-
assert(ResTy.isVector());
unsigned NumElts = ResTy.getVectorNumElements();
- SDValue Vector =
- DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ResTy, Node->getOperand(0));
+ SDValue Op0 = Node->getOperand(0);
+ SDValue Vector = DAG.getUNDEF(ResTy);
+
+ if (!Op0.isUndef())
+ Vector = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ResTy, Op0);
for (unsigned i = 1; i < NumElts; ++i) {
- Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ResTy, Vector,
- Node->getOperand(i),
+ SDValue Opi = Node->getOperand(i);
+ if (Opi.isUndef())
+ continue;
+ Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ResTy, Vector, Opi,
DAG.getConstant(i, DL, Subtarget.getGRLenVT()));
}
return Vector;
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
index 8fa72bc..d9ea88c 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
@@ -254,6 +254,7 @@ bool LoongArchAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) {
MCFixup Fixup =
MCFixup::create(0, Expr, FirstLiteralRelocationKind + ELF::R_LARCH_ALIGN);
F.setVarFixups({Fixup});
+ F.setLinkerRelaxable();
F.getParent()->setLinkerRelaxable();
return true;
}
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index feb4eb3..d9680c7 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -969,7 +969,7 @@ void MipsTargetELFStreamer::finish() {
Align Alignment = Section.getAlign();
S.switchSection(&Section);
- if (Section.useCodeAlign())
+ if (getContext().getAsmInfo()->useCodeAlign(Section))
S.emitCodeAlignment(Alignment, &STI, Alignment.value());
else
S.emitValueToAlignment(Alignment, 0, 1, Alignment.value());
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
index 614b321..ce9cd12 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
@@ -15,8 +15,6 @@
using namespace llvm;
-void NVPTXMCAsmInfo::anchor() {}
-
NVPTXMCAsmInfo::NVPTXMCAsmInfo(const Triple &TheTriple,
const MCTargetOptions &Options) {
if (TheTriple.getArch() == Triple::nvptx64) {
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
index 77c4dae..f071406 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
@@ -19,8 +19,6 @@ namespace llvm {
class Triple;
class NVPTXMCAsmInfo : public MCAsmInfo {
- virtual void anchor();
-
public:
explicit NVPTXMCAsmInfo(const Triple &TheTriple,
const MCTargetOptions &Options);
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp
index 9f91143..329e3b5 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp
@@ -97,10 +97,7 @@ void NVPTXTargetStreamer::changeSection(const MCSection *CurSection,
if (isDwarfSection(FI, Section)) {
// Emit DWARF .file directives in the outermost scope.
outputDwarfFileDirectives();
- OS << "\t.section";
- Section->printSwitchToSection(*getStreamer().getContext().getAsmInfo(),
- getStreamer().getContext().getTargetTriple(),
- OS, SubSection);
+ OS << "\t.section\t" << Section->getName() << '\n';
// DWARF sections are enclosed into braces - emit the open one.
OS << "\t{\n";
HasSections = true;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index f2c2f46..ddcecc00 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -952,10 +952,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// promoted to f32. v2f16 is expanded to f16, which is then promoted
// to f32.
for (const auto &Op :
- {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS}) {
+ {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS, ISD::FTANH}) {
setOperationAction(Op, MVT::f16, Promote);
setOperationAction(Op, MVT::f32, Legal);
- setOperationAction(Op, MVT::f64, Legal);
+ // only div/rem/sqrt are legal for f64
+ if (Op == ISD::FDIV || Op == ISD::FREM || Op == ISD::FSQRT) {
+ setOperationAction(Op, MVT::f64, Legal);
+ }
setOperationAction(Op, {MVT::v2f16, MVT::v2bf16, MVT::v2f32}, Expand);
setOperationAction(Op, MVT::bf16, Promote);
AddPromotedToType(Op, MVT::bf16, MVT::f32);
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index b5df4c6..442b900 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1234,7 +1234,7 @@ defm FMA_F32 : FMA<F32RT, allow_ftz = true>;
defm FMA_F32x2 : FMA<F32X2RT, allow_ftz = true, preds = [hasF32x2Instructions]>;
defm FMA_F64 : FMA<F64RT, allow_ftz = false>;
-// sin/cos
+// sin/cos/tanh
class UnaryOpAllowsApproxFn<SDPatternOperator operator>
: PatFrag<(ops node:$A),
@@ -1250,6 +1250,10 @@ def COS_APPROX_f32 :
BasicFlagsNVPTXInst<(outs B32:$dst), (ins B32:$src), (ins FTZFlag:$ftz),
"cos.approx$ftz.f32",
[(set f32:$dst, (UnaryOpAllowsApproxFn<fcos> f32:$src))]>;
+def TANH_APPROX_f32 :
+ BasicNVPTXInst<(outs B32:$dst), (ins B32:$src), "tanh.approx.f32",
+ [(set f32:$dst, (UnaryOpAllowsApproxFn<ftanh> f32:$src))]>,
+ Requires<[hasPTX<70>, hasSM<75>]>;
//-----------------------------------
// Bitwise operations
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
index 8baf866..1af2f9c 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
@@ -220,8 +220,6 @@ bool PPCELFMCAsmInfo::evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr,
return evaluateAsRelocatable(Expr, Res, Asm);
}
-void PPCXCOFFMCAsmInfo::anchor() {}
-
PPCXCOFFMCAsmInfo::PPCXCOFFMCAsmInfo(bool Is64Bit, const Triple &T) {
if (T.getArch() == Triple::ppc64le || T.getArch() == Triple::ppcle)
report_fatal_error("XCOFF is not supported for little-endian targets");
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
index 0f945b3..6af1bd7 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
@@ -33,8 +33,6 @@ public:
};
class PPCXCOFFMCAsmInfo : public MCAsmInfoXCOFF {
- void anchor() override;
-
public:
explicit PPCXCOFFMCAsmInfo(bool is64Bit, const Triple &);
void printSpecifierExpr(raw_ostream &OS,
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index 54497d9..3dad0e8 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -213,7 +213,7 @@ public:
void emitTCEntry(const MCSymbol &S, PPCMCExpr::Specifier Kind) override {
if (const MCSymbolXCOFF *XSym = dyn_cast<MCSymbolXCOFF>(&S)) {
MCSymbolXCOFF *TCSym =
- cast<MCSectionXCOFF>(Streamer.getCurrentSectionOnly())
+ static_cast<const MCSectionXCOFF *>(Streamer.getCurrentSectionOnly())
->getQualNameSymbol();
// On AIX, we have TLS variable offsets (symbol@({gd|ie|le|ld}) depending
// on the TLS access method (or model). For the general-dynamic access
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index a091b21..ce1d51a 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -2274,9 +2274,9 @@ void PPCAIXAsmPrinter::emitLinkage(const GlobalValue *GV,
void PPCAIXAsmPrinter::SetupMachineFunction(MachineFunction &MF) {
// Setup CurrentFnDescSym and its containing csect.
- MCSectionXCOFF *FnDescSec =
- cast<MCSectionXCOFF>(getObjFileLowering().getSectionForFunctionDescriptor(
- &MF.getFunction(), TM));
+ auto *FnDescSec = static_cast<MCSectionXCOFF *>(
+ getObjFileLowering().getSectionForFunctionDescriptor(&MF.getFunction(),
+ TM));
FnDescSec->setAlignment(Align(Subtarget->isPPC64() ? 8 : 4));
CurrentFnDescSym = FnDescSec->getQualNameSymbol();
@@ -2669,9 +2669,9 @@ void PPCAIXAsmPrinter::emitTracebackTable() {
MCSymbol *EHInfoSym =
TargetLoweringObjectFileXCOFF::getEHInfoTableSymbol(MF);
MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(EHInfoSym, TOCType_EHBlock);
- const MCSymbol *TOCBaseSym =
- cast<MCSectionXCOFF>(getObjFileLowering().getTOCBaseSection())
- ->getQualNameSymbol();
+ const MCSymbol *TOCBaseSym = static_cast<const MCSectionXCOFF *>(
+ getObjFileLowering().getTOCBaseSection())
+ ->getQualNameSymbol();
const MCExpr *Exp =
MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCEntry, Ctx),
MCSymbolRefExpr::create(TOCBaseSym, Ctx), Ctx);
@@ -2788,7 +2788,7 @@ void PPCAIXAsmPrinter::emitGlobalVariableHelper(const GlobalVariable *GV) {
}
}
- MCSectionXCOFF *Csect = cast<MCSectionXCOFF>(
+ auto *Csect = static_cast<MCSectionXCOFF *>(
getObjFileLowering().SectionForGlobal(GV, GVKind, TM));
// Switch to the containing csect.
@@ -2869,9 +2869,9 @@ void PPCAIXAsmPrinter::emitFunctionDescriptor() {
OutStreamer->emitValue(MCSymbolRefExpr::create(CurrentFnSym, OutContext),
PointerSize);
// Emit TOC base address.
- const MCSymbol *TOCBaseSym =
- cast<MCSectionXCOFF>(getObjFileLowering().getTOCBaseSection())
- ->getQualNameSymbol();
+ const MCSymbol *TOCBaseSym = static_cast<const MCSectionXCOFF *>(
+ getObjFileLowering().getTOCBaseSection())
+ ->getQualNameSymbol();
OutStreamer->emitValue(MCSymbolRefExpr::create(TOCBaseSym, OutContext),
PointerSize);
// Emit a null environment pointer.
@@ -2996,10 +2996,10 @@ void PPCAIXAsmPrinter::emitEndOfAsmFile(Module &M) {
Name += Prefix;
Name += cast<MCSymbolXCOFF>(I.first.first)->getSymbolTableName();
MCSymbol *S = OutContext.getOrCreateSymbol(Name);
- TCEntry = cast<MCSectionXCOFF>(
+ TCEntry = static_cast<MCSectionXCOFF *>(
getObjFileLowering().getSectionForTOCEntry(S, TM));
} else {
- TCEntry = cast<MCSectionXCOFF>(
+ TCEntry = static_cast<MCSectionXCOFF *>(
getObjFileLowering().getSectionForTOCEntry(I.first.first, TM));
}
OutStreamer->switchSection(TCEntry);
@@ -3054,7 +3054,7 @@ bool PPCAIXAsmPrinter::doInitialization(Module &M) {
return;
SectionKind GOKind = getObjFileLowering().getKindForGlobal(GO, TM);
- MCSectionXCOFF *Csect = cast<MCSectionXCOFF>(
+ auto *Csect = static_cast<MCSectionXCOFF *>(
getObjFileLowering().SectionForGlobal(GO, GOKind, TM));
Align GOAlign = getGVAlignment(GO, GO->getDataLayout());
@@ -3316,9 +3316,9 @@ void PPCAIXAsmPrinter::emitTTypeReference(const GlobalValue *GV,
GlobalType = TOCType_GlobalExternal;
MCSymbol *TypeInfoSym = TM.getSymbol(GV);
MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(TypeInfoSym, GlobalType);
- const MCSymbol *TOCBaseSym =
- cast<MCSectionXCOFF>(getObjFileLowering().getTOCBaseSection())
- ->getQualNameSymbol();
+ const MCSymbol *TOCBaseSym = static_cast<const MCSectionXCOFF *>(
+ getObjFileLowering().getTOCBaseSection())
+ ->getQualNameSymbol();
auto &Ctx = OutStreamer->getContext();
const MCExpr *Exp =
MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCEntry, Ctx),
diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index a143d85..d71c42c 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -3849,9 +3849,14 @@ bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
switch (Inst.getOpcode()) {
default:
break;
- case RISCV::PseudoC_ADDI_NOP:
- emitToStreamer(Out, MCInstBuilder(RISCV::C_NOP));
+ case RISCV::PseudoC_ADDI_NOP: {
+ if (Inst.getOperand(2).getImm() == 0)
+ emitToStreamer(Out, MCInstBuilder(RISCV::C_NOP));
+ else
+ emitToStreamer(
+ Out, MCInstBuilder(RISCV::C_NOP_HINT).addOperand(Inst.getOperand(2)));
return false;
+ }
case RISCV::PseudoLLAImm:
case RISCV::PseudoLAImm:
case RISCV::PseudoLI: {
diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index fa7bcfa..5e54b82 100644
--- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -193,21 +193,19 @@ static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst, uint32_t RegNo,
static DecodeStatus DecodeGPRNoX0RegisterClass(MCInst &Inst, uint32_t RegNo,
uint64_t Address,
const MCDisassembler *Decoder) {
- if (RegNo == 0) {
+ if (RegNo == 0)
return MCDisassembler::Fail;
- }
return DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder);
}
-static DecodeStatus
-DecodeGPRNoX0X2RegisterClass(MCInst &Inst, uint64_t RegNo, uint32_t Address,
- const MCDisassembler *Decoder) {
- if (RegNo == 2) {
+static DecodeStatus DecodeGPRNoX2RegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint32_t Address,
+ const MCDisassembler *Decoder) {
+ if (RegNo == 2)
return MCDisassembler::Fail;
- }
- return DecodeGPRNoX0RegisterClass(Inst, RegNo, Address, Decoder);
+ return DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder);
}
static DecodeStatus DecodeGPRNoX31RegisterClass(MCInst &Inst, uint32_t RegNo,
@@ -536,31 +534,6 @@ static DecodeStatus decodeRTZArg(MCInst &Inst, uint32_t Imm, int64_t Address,
return MCDisassembler::Success;
}
-static DecodeStatus decodeRVCInstrRdRs1ImmZero(MCInst &Inst, uint32_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus decodeRVCInstrRdSImm6(MCInst &Inst, uint32_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus decodeRVCInstrRdCLUIImm(MCInst &Inst, uint32_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus
-decodeRVCInstrRdRs1UImmLog2XLenNonZero(MCInst &Inst, uint32_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus decodeRVCInstrRdRs2(MCInst &Inst, uint32_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus decodeRVCInstrRdRs1Rs2(MCInst &Inst, uint32_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn,
uint64_t Address,
const MCDisassembler *Decoder);
@@ -579,18 +552,6 @@ static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint32_t Insn,
#include "RISCVGenDisassemblerTables.inc"
-static DecodeStatus decodeRVCInstrRdRs1ImmZero(MCInst &Inst, uint32_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- DecodeStatus S = MCDisassembler::Success;
- uint32_t Rd = fieldFromInstruction(Insn, 7, 5);
- if (!Check(S, DecodeGPRNoX0RegisterClass(Inst, Rd, Address, Decoder)))
- return MCDisassembler::Fail;
- Inst.addOperand(Inst.getOperand(0));
- Inst.addOperand(MCOperand::createImm(0));
- return S;
-}
-
static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint32_t Insn,
uint64_t Address,
const MCDisassembler *Decoder) {
@@ -601,66 +562,6 @@ static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint32_t Insn,
return MCDisassembler::Success;
}
-static DecodeStatus decodeRVCInstrRdSImm6(MCInst &Inst, uint32_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- Inst.addOperand(MCOperand::createReg(RISCV::X0));
- uint32_t Imm =
- fieldFromInstruction(Insn, 12, 1) << 5 | fieldFromInstruction(Insn, 2, 5);
- [[maybe_unused]] DecodeStatus Result =
- decodeSImmOperand<6>(Inst, Imm, Address, Decoder);
- assert(Result == MCDisassembler::Success && "Invalid immediate");
- return MCDisassembler::Success;
-}
-
-static DecodeStatus decodeRVCInstrRdCLUIImm(MCInst &Inst, uint32_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- Inst.addOperand(MCOperand::createReg(RISCV::X0));
- uint32_t Imm =
- fieldFromInstruction(Insn, 12, 1) << 5 | fieldFromInstruction(Insn, 2, 5);
- return decodeCLUIImmOperand(Inst, Imm, Address, Decoder);
-}
-
-static DecodeStatus
-decodeRVCInstrRdRs1UImmLog2XLenNonZero(MCInst &Inst, uint32_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- Inst.addOperand(MCOperand::createReg(RISCV::X0));
- Inst.addOperand(Inst.getOperand(0));
-
- uint32_t UImm6 =
- fieldFromInstruction(Insn, 12, 1) << 5 | fieldFromInstruction(Insn, 2, 5);
- return decodeUImmLog2XLenNonZeroOperand(Inst, UImm6, Address, Decoder);
-}
-
-static DecodeStatus decodeRVCInstrRdRs2(MCInst &Inst, uint32_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- DecodeStatus S = MCDisassembler::Success;
- uint32_t Rd = fieldFromInstruction(Insn, 7, 5);
- uint32_t Rs2 = fieldFromInstruction(Insn, 2, 5);
- if (!Check(S, DecodeGPRRegisterClass(Inst, Rd, Address, Decoder)))
- return MCDisassembler::Fail;
- if (!Check(S, DecodeGPRRegisterClass(Inst, Rs2, Address, Decoder)))
- return MCDisassembler::Fail;
- return S;
-}
-
-static DecodeStatus decodeRVCInstrRdRs1Rs2(MCInst &Inst, uint32_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- DecodeStatus S = MCDisassembler::Success;
- uint32_t Rd = fieldFromInstruction(Insn, 7, 5);
- uint32_t Rs2 = fieldFromInstruction(Insn, 2, 5);
- if (!Check(S, DecodeGPRRegisterClass(Inst, Rd, Address, Decoder)))
- return MCDisassembler::Fail;
- Inst.addOperand(Inst.getOperand(0));
- if (!Check(S, DecodeGPRRegisterClass(Inst, Rs2, Address, Decoder)))
- return MCDisassembler::Fail;
- return S;
-}
-
static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn,
uint64_t Address,
const MCDisassembler *Decoder) {
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
index 2c37c3b..82e3b5c 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
@@ -320,6 +320,7 @@ bool RISCVAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) {
MCFixup Fixup =
MCFixup::create(0, Expr, FirstLiteralRelocationKind + ELF::R_RISCV_ALIGN);
F.setVarFixups({Fixup});
+ F.setLinkerRelaxable();
F.getParent()->setLinkerRelaxable();
return true;
}
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
index 7ad5d5f..bddea43 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
@@ -330,7 +330,6 @@ enum OperandType : unsigned {
OPERAND_UIMM32,
OPERAND_UIMM48,
OPERAND_UIMM64,
- OPERAND_ZERO,
OPERAND_THREE,
OPERAND_FOUR,
OPERAND_SIMM5,
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
index baa508a..269b117 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
@@ -13,13 +13,7 @@
#include "MCTargetDesc/RISCVAsmBackend.h"
#include "MCTargetDesc/RISCVMCAsmInfo.h"
-#include "RISCVFixupKinds.h"
#include "llvm/BinaryFormat/ELF.h"
-#include "llvm/MC/MCAssembler.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCValue.h"
-#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
index d4f5d8f..2f32e2a 100644
--- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
+++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
@@ -293,7 +293,7 @@ void RISCVAsmPrinter::emitNTLHint(const MachineInstr *MI) {
MCInst Hint;
if (STI->hasStdExtZca())
- Hint.setOpcode(RISCV::C_ADD_HINT);
+ Hint.setOpcode(RISCV::C_ADD);
else
Hint.setOpcode(RISCV::ADD);
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index f9c0b54..171940e 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1272,7 +1272,7 @@ def FeatureVendorXSfmm128t
def FeatureVendorXSfvqmaccdod
: RISCVExtension<1, 0,
"SiFive Int8 Matrix Multiplication Instructions (2-by-8 and 8-by-2)",
- [FeatureStdExtZve32x]>;
+ [FeatureStdExtZve32x, FeatureStdExtZvl128b]>;
def HasVendorXSfvqmaccdod
: Predicate<"Subtarget->hasVendorXSfvqmaccdod()">,
AssemblerPredicate<(all_of FeatureVendorXSfvqmaccdod),
@@ -1281,7 +1281,7 @@ def HasVendorXSfvqmaccdod
def FeatureVendorXSfvqmaccqoq
: RISCVExtension<1, 0,
"SiFive Int8 Matrix Multiplication Instructions (4-by-8 and 8-by-4)",
- [FeatureStdExtZve32x]>;
+ [FeatureStdExtZve32x, FeatureStdExtZvl256b]>;
def HasVendorXSfvqmaccqoq
: Predicate<"Subtarget->hasVendorXSfvqmaccqoq()">,
AssemblerPredicate<(all_of FeatureVendorXSfvqmaccqoq),
@@ -1290,7 +1290,7 @@ def HasVendorXSfvqmaccqoq
def FeatureVendorXSfvfwmaccqqq
: RISCVExtension<1, 0,
"SiFive Matrix Multiply Accumulate Instruction (4-by-4)",
- [FeatureStdExtZvfbfmin]>;
+ [FeatureStdExtZvfbfmin, FeatureStdExtZvl128b]>;
def HasVendorXSfvfwmaccqqq
: Predicate<"Subtarget->hasVendorXSfvfwmaccqqq()">,
AssemblerPredicate<(all_of FeatureVendorXSfvfwmaccqqq),
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 3918dd2..54845e5 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1618,6 +1618,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
}
}
+ // Customize load and store operation for bf16 if zfh isn't enabled.
+ if (Subtarget.hasVendorXAndesBFHCvt() && !Subtarget.hasStdExtZfh()) {
+ setOperationAction(ISD::LOAD, MVT::bf16, Custom);
+ setOperationAction(ISD::STORE, MVT::bf16, Custom);
+ }
+
// Function alignments.
const Align FunctionAlignment(Subtarget.hasStdExtZca() ? 2 : 4);
setMinFunctionAlignment(FunctionAlignment);
@@ -7216,6 +7222,47 @@ static SDValue SplitStrictFPVectorOp(SDValue Op, SelectionDAG &DAG) {
return DAG.getMergeValues({V, HiRes.getValue(1)}, DL);
}
+SDValue
+RISCVTargetLowering::lowerXAndesBfHCvtBFloat16Load(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Subtarget.hasVendorXAndesBFHCvt() && !Subtarget.hasStdExtZfh() &&
+ "Unexpected bfloat16 load lowering");
+
+ SDLoc DL(Op);
+ LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
+ EVT MemVT = LD->getMemoryVT();
+ SDValue Load = DAG.getExtLoad(
+ ISD::ZEXTLOAD, DL, Subtarget.getXLenVT(), LD->getChain(),
+ LD->getBasePtr(),
+ EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
+ LD->getMemOperand());
+ // Use a mask to keep the bf16 value nan-boxed when the flh instruction is
+ // not available. -65536 (0xFFFF0000) can be materialized with a single lui
+ // because its low 12 bits are zero.
+ SDValue Mask = DAG.getSignedConstant(-65536, DL, Subtarget.getXLenVT());
+ SDValue OrSixteenOne =
+ DAG.getNode(ISD::OR, DL, Load.getValueType(), {Load, Mask});
+ SDValue ConvertedResult =
+ DAG.getNode(RISCVISD::NDS_FMV_BF16_X, DL, MVT::bf16, OrSixteenOne);
+ return DAG.getMergeValues({ConvertedResult, Load.getValue(1)}, DL);
+}
+
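The masking comment above is the core of the change; the following is a minimal standalone sketch (illustrative only, not LLVM code: the function name and use of plain integer types are assumptions) of how OR-ing the zero-extended bf16 payload with -65536 (0xFFFF0000) yields a correctly nan-boxed value for the FP register file.

#include <cstdint>

// Illustrative only: nan-box a raw bf16 payload into a 32-bit FP register image.
uint32_t nanBoxBF16(uint16_t Payload) {
  // 0xFFFF0000 == -65536; its low 12 bits are zero, so a single lui can
  // materialize it, as the comment in the patch notes.
  return static_cast<uint32_t>(Payload) | 0xFFFF0000u;
}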
+SDValue
+RISCVTargetLowering::lowerXAndesBfHCvtBFloat16Store(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Subtarget.hasVendorXAndesBFHCvt() && !Subtarget.hasStdExtZfh() &&
+ "Unexpected bfloat16 store lowering");
+
+ StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
+ SDLoc DL(Op);
+ SDValue FMV = DAG.getNode(RISCVISD::NDS_FMV_X_ANYEXTBF16, DL,
+ Subtarget.getXLenVT(), ST->getValue());
+ return DAG.getTruncStore(
+ ST->getChain(), DL, FMV, ST->getBasePtr(),
+ EVT::getIntegerVT(*DAG.getContext(), ST->getMemoryVT().getSizeInBits()),
+ ST->getMemOperand());
+}
+
SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
@@ -7914,6 +7961,9 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
return DAG.getMergeValues({Pair, Chain}, DL);
}
+ if (VT == MVT::bf16)
+ return lowerXAndesBfHCvtBFloat16Load(Op, DAG);
+
// Handle normal vector tuple load.
if (VT.isRISCVVectorTuple()) {
SDLoc DL(Op);
@@ -7998,6 +8048,10 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
{Store->getChain(), Lo, Hi, Store->getBasePtr()}, MVT::i64,
Store->getMemOperand());
}
+
+ if (VT == MVT::bf16)
+ return lowerXAndesBfHCvtBFloat16Store(Op, DAG);
+
// Handle normal vector tuple store.
if (VT.isRISCVVectorTuple()) {
SDLoc DL(Op);
@@ -16079,7 +16133,7 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
uint64_t MulAmt = CNode->getZExtValue();
// Don't do this if the Xqciac extension is enabled and the MulAmt in simm12.
- if (Subtarget.hasVendorXqciac() && isInt<12>(MulAmt))
+ if (Subtarget.hasVendorXqciac() && isInt<12>(CNode->getSExtValue()))
return SDValue();
const bool HasShlAdd = Subtarget.hasStdExtZba() ||
@@ -16184,10 +16238,12 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
// 2^N - 3/5/9 --> (sub (shl X, C1), (shXadd X, x))
for (uint64_t Offset : {3, 5, 9}) {
if (isPowerOf2_64(MulAmt + Offset)) {
+ unsigned ShAmt = Log2_64(MulAmt + Offset);
+ if (ShAmt >= VT.getSizeInBits())
+ continue;
SDLoc DL(N);
SDValue Shift1 =
- DAG.getNode(ISD::SHL, DL, VT, X,
- DAG.getConstant(Log2_64(MulAmt + Offset), DL, VT));
+ DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShAmt, DL, VT));
SDValue Mul359 =
DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
DAG.getConstant(Log2_64(Offset - 1), DL, VT), X);
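The new ShAmt bound check above guards a degenerate case for narrow types. A minimal sketch follows, assuming a simplified standalone helper (the function name and the use of __builtin_clzll in place of Log2_64 are illustrative, not the LLVM routine itself): when MulAmt + Offset is 2^32 for an i32 value, the implied shift amount equals the type width and the fold must be rejected.

#include <cstdint>

// Hypothetical, simplified version of the guard: only fold 2^N - 3/5/9 into
// (shl X, N) minus (shXadd X, X) when N is a legal shift for the value type.
bool canFoldMulToShlSub(uint64_t MulAmt, uint64_t Offset, unsigned BitWidth) {
  uint64_t Sum = MulAmt + Offset;
  if (Sum == 0 || (Sum & (Sum - 1)) != 0)
    return false;                              // MulAmt + Offset is not a power of two
  unsigned ShAmt = 63 - __builtin_clzll(Sum);  // Log2_64(MulAmt + Offset)
  return ShAmt < BitWidth;                     // e.g. reject ShAmt == 32 for i32
}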
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index f0447e0..ca70c46 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -578,6 +578,9 @@ private:
SDValue lowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerPARTIAL_REDUCE_MLA(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerXAndesBfHCvtBFloat16Load(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerXAndesBfHCvtBFloat16Store(SDValue Op, SelectionDAG &DAG) const;
+
bool isEligibleForTailCallOptimization(
CCState &CCInfo, CallLoweringInfo &CLI, MachineFunction &MF,
const SmallVector<CCValAssign, 16> &ArgLocs) const;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 64f9e3e..085064e 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -2859,9 +2859,6 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI,
case RISCVOp::OPERAND_UIMM16_NONZERO:
Ok = isUInt<16>(Imm) && (Imm != 0);
break;
- case RISCVOp::OPERAND_ZERO:
- Ok = Imm == 0;
- break;
case RISCVOp::OPERAND_THREE:
Ok = Imm == 3;
break;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
index 8252a9b..c5551fb 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
@@ -57,12 +57,6 @@ def simm6nonzero : RISCVOp,
}];
}
-def immzero : RISCVOp,
- ImmLeaf<XLenVT, [{return (Imm == 0);}]> {
- let ParserMatchClass = ImmZeroAsmOperand;
- let OperandType = "OPERAND_ZERO";
-}
-
def CLUIImmAsmOperand : AsmOperandClass {
let Name = "CLUIImm";
let RenderMethod = "addImmOperands";
@@ -272,7 +266,7 @@ class Bcz<bits<3> funct3, string OpcodeStr>
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
class Shift_right<bits<2> funct2, string OpcodeStr>
: RVInst16CB<0b100, 0b01, (outs GPRC:$rd),
- (ins GPRC:$rs1, uimmlog2xlennonzero:$imm),
+ (ins GPRC:$rs1, uimmlog2xlen:$imm),
OpcodeStr, "$rs1, $imm"> {
let Constraints = "$rs1 = $rd";
let Inst{12} = imm{5};
@@ -402,17 +396,19 @@ def C_NOP : RVInst16CI<0b000, 0b01, (outs), (ins), "c.nop", "">,
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
def C_ADDI : RVInst16CI<0b000, 0b01, (outs GPRNoX0:$rd_wb),
- (ins GPRNoX0:$rd, simm6nonzero:$imm),
+ (ins GPRNoX0:$rd, simm6:$imm),
"c.addi", "$rd, $imm">,
Sched<[WriteIALU, ReadIALU]> {
let Constraints = "$rd = $rd_wb";
}
-// Alternate syntax for c.nop. Converted to C_NOP by the assembler.
+// Alternate syntax for c.nop. Converted to C_NOP/C_NOP_HINT by the assembler.
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCodeGenOnly = 0,
isAsmParserOnly = 1 in
-def PseudoC_ADDI_NOP : Pseudo<(outs GPRX0:$rd), (ins GPRX0:$rs1, immzero:$imm),
- [], "c.addi", "$rd, $imm">;
+def PseudoC_ADDI_NOP : Pseudo<(outs GPRX0:$rd), (ins GPRX0:$rs1, simm6:$imm),
+ [], "c.addi", "$rd, $imm"> {
+ let Constraints = "$rs1 = $rd";
+}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCall = 1,
DecoderNamespace = "RV32Only", Defs = [X1],
@@ -430,7 +426,7 @@ def C_ADDIW : RVInst16CI<0b001, 0b01, (outs GPRNoX0:$rd_wb),
}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-def C_LI : RVInst16CI<0b010, 0b01, (outs GPRNoX0:$rd), (ins simm6:$imm),
+def C_LI : RVInst16CI<0b010, 0b01, (outs GPR:$rd), (ins simm6:$imm),
"c.li", "$rd, $imm">,
Sched<[WriteIALU]>;
@@ -449,7 +445,7 @@ def C_ADDI16SP : RVInst16CI<0b011, 0b01, (outs SP:$rd_wb),
}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-def C_LUI : RVInst16CI<0b011, 0b01, (outs GPRNoX0X2:$rd),
+def C_LUI : RVInst16CI<0b011, 0b01, (outs GPRNoX2:$rd),
(ins c_lui_imm:$imm),
"c.lui", "$rd, $imm">,
Sched<[WriteIALU]>;
@@ -497,8 +493,8 @@ def C_BEQZ : Bcz<0b110, "c.beqz">, Sched<[WriteJmp, ReadJmp]>;
def C_BNEZ : Bcz<0b111, "c.bnez">, Sched<[WriteJmp, ReadJmp]>;
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-def C_SLLI : RVInst16CI<0b000, 0b10, (outs GPRNoX0:$rd_wb),
- (ins GPRNoX0:$rd, uimmlog2xlennonzero:$imm),
+def C_SLLI : RVInst16CI<0b000, 0b10, (outs GPR:$rd_wb),
+ (ins GPR:$rd, uimmlog2xlen:$imm),
"c.slli", "$rd, $imm">,
Sched<[WriteShiftImm, ReadShiftImm]> {
let Constraints = "$rd = $rd_wb";
@@ -544,7 +540,7 @@ def C_JR : RVInst16CR<0b1000, 0b10, (outs), (ins GPRNoX0:$rs1),
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isMoveReg = 1,
isAsCheapAsAMove = 1 in
-def C_MV : RVInst16CR<0b1000, 0b10, (outs GPRNoX0:$rs1), (ins GPRNoX0:$rs2),
+def C_MV : RVInst16CR<0b1000, 0b10, (outs GPR:$rs1), (ins GPRNoX0:$rs2),
"c.mv", "$rs1, $rs2">,
Sched<[WriteIALU, ReadIALU]>;
@@ -557,8 +553,8 @@ def C_JALR : RVInst16CR<0b1001, 0b10, (outs), (ins GPRNoX0:$rs1),
"c.jalr", "$rs1">, Sched<[WriteJalr, ReadJalr]>;
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-def C_ADD : RVInst16CR<0b1001, 0b10, (outs GPRNoX0:$rd),
- (ins GPRNoX0:$rs1, GPRNoX0:$rs2),
+def C_ADD : RVInst16CR<0b1001, 0b10, (outs GPR:$rd),
+ (ins GPR:$rs1, GPRNoX0:$rs2),
"c.add", "$rs1, $rs2">,
Sched<[WriteIALU, ReadIALU, ReadIALU]> {
let Constraints = "$rs1 = $rd";
@@ -616,81 +612,6 @@ def C_NOP_HINT : RVInst16CI<0b000, 0b01, (outs), (ins simm6nonzero:$imm),
let rd = 0;
}
-def C_ADDI_HINT_IMM_ZERO : RVInst16CI<0b000, 0b01, (outs GPRNoX0:$rd_wb),
- (ins GPRNoX0:$rd, immzero:$imm),
- "c.addi", "$rd, $imm">,
- Sched<[WriteIALU, ReadIALU]> {
- let Constraints = "$rd = $rd_wb";
- let imm = 0;
- let DecoderMethod = "decodeRVCInstrRdRs1ImmZero";
-}
-
-def C_LI_HINT : RVInst16CI<0b010, 0b01, (outs GPRX0:$rd), (ins simm6:$imm),
- "c.li", "$rd, $imm">,
- Sched<[WriteIALU]> {
- let Inst{11-7} = 0;
- let DecoderMethod = "decodeRVCInstrRdSImm6";
-}
-
-def C_LUI_HINT : RVInst16CI<0b011, 0b01, (outs GPRX0:$rd),
- (ins c_lui_imm:$imm),
- "c.lui", "$rd, $imm">,
- Sched<[WriteIALU]> {
- let Inst{11-7} = 0;
- let DecoderMethod = "decodeRVCInstrRdCLUIImm";
-}
-
-def C_MV_HINT : RVInst16CR<0b1000, 0b10, (outs GPRX0:$rs1), (ins GPRNoX0:$rs2),
- "c.mv", "$rs1, $rs2">, Sched<[WriteIALU, ReadIALU]> {
- let Inst{11-7} = 0;
- let DecoderMethod = "decodeRVCInstrRdRs2";
-}
-
-def C_ADD_HINT : RVInst16CR<0b1001, 0b10, (outs GPRX0:$rd),
- (ins GPRX0:$rs1, GPRNoX0:$rs2),
- "c.add", "$rs1, $rs2">,
- Sched<[WriteIALU, ReadIALU, ReadIALU]> {
- let Constraints = "$rs1 = $rd";
- let Inst{11-7} = 0;
- let DecoderMethod = "decodeRVCInstrRdRs1Rs2";
-}
-
-def C_SLLI_HINT : RVInst16CI<0b000, 0b10, (outs GPRX0:$rd_wb),
- (ins GPRX0:$rd, uimmlog2xlennonzero:$imm),
- "c.slli", "$rd, $imm">,
- Sched<[WriteShiftImm, ReadShiftImm]> {
- let Constraints = "$rd = $rd_wb";
- let Inst{11-7} = 0;
- let DecoderMethod = "decodeRVCInstrRdRs1UImmLog2XLenNonZero";
-}
-
-def C_SLLI64_HINT : RVInst16CI<0b000, 0b10, (outs GPR:$rd_wb), (ins GPR:$rd),
- "c.slli64", "$rd">,
- Sched<[WriteShiftImm, ReadShiftImm]> {
- let Constraints = "$rd = $rd_wb";
- let imm = 0;
-}
-
-def C_SRLI64_HINT : RVInst16CB<0b100, 0b01, (outs GPRC:$rd),
- (ins GPRC:$rs1),
- "c.srli64", "$rs1">,
- Sched<[WriteShiftImm, ReadShiftImm]> {
- let Constraints = "$rs1 = $rd";
- let Inst{6-2} = 0;
- let Inst{11-10} = 0b00;
- let Inst{12} = 0;
-}
-
-def C_SRAI64_HINT : RVInst16CB<0b100, 0b01, (outs GPRC:$rd),
- (ins GPRC:$rs1),
- "c.srai64", "$rs1">,
- Sched<[WriteShiftImm, ReadShiftImm]> {
- let Constraints = "$rs1 = $rd";
- let Inst{6-2} = 0;
- let Inst{11-10} = 0b01;
- let Inst{12} = 0;
-}
-
} // Predicates = [HasStdExtZca], hasSideEffects = 0, mayLoad = 0,
// mayStore = 0
@@ -699,15 +620,17 @@ def C_SRAI64_HINT : RVInst16CB<0b100, 0b01, (outs GPRC:$rd),
//===----------------------------------------------------------------------===//
let Predicates = [HasStdExtZca] in {
-// Just a different syntax for the c.nop hint: c.addi x0, simm6 vs c.nop simm6.
-def : InstAlias<"c.addi x0, $imm", (C_NOP_HINT simm6nonzero:$imm), 0>;
+// Legacy aliases.
+def : InstAlias<"c.slli64 $rd", (C_SLLI GPR:$rd, 0), 0>;
+def : InstAlias<"c.srli64 $rs1", (C_SRLI GPRC:$rs1, 0), 0>;
+def : InstAlias<"c.srai64 $rs1", (C_SRAI GPRC:$rs1, 0), 0>;
}
let Predicates = [HasStdExtC, HasStdExtZihintntl] in {
-def : InstAlias<"c.ntl.p1", (C_ADD_HINT X0, X2)>;
-def : InstAlias<"c.ntl.pall", (C_ADD_HINT X0, X3)>;
-def : InstAlias<"c.ntl.s1", (C_ADD_HINT X0, X4)>;
-def : InstAlias<"c.ntl.all", (C_ADD_HINT X0, X5)>;
+def : InstAlias<"c.ntl.p1", (C_ADD X0, X2)>;
+def : InstAlias<"c.ntl.pall", (C_ADD X0, X3)>;
+def : InstAlias<"c.ntl.s1", (C_ADD X0, X4)>;
+def : InstAlias<"c.ntl.all", (C_ADD X0, X5)>;
} // Predicates = [HasStdExtC, HasStdExtZihintntl]
let EmitPriority = 0 in {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
index 7f1077a..dd365cf 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
@@ -89,39 +89,41 @@ class PLI_B_i<bits<8> funct8, string opcodestr>
}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-class RVPUnary<bits<3> f, string opcodestr, dag operands, string argstr>
- : RVInstIBase<0b010, OPC_OP_IMM_32, (outs GPR:$rd), operands, opcodestr, argstr> {
+class RVPShift_ri<bits<3> f, bits<3> funct3, string opcodestr, Operand ImmType>
+ : RVInstIBase<funct3, OPC_OP_IMM_32, (outs GPR:$rd),
+ (ins GPR:$rs1, ImmType:$shamt), opcodestr,
+ "$rd, $rs1, $shamt"> {
let Inst{31} = 0b1;
let Inst{30-28} = f;
let Inst{27} = 0b0;
}
-class RVPUnaryImm5<bits<3> f, string opcodestr>
- : RVPUnary<f, opcodestr, (ins GPR:$rs1, uimm5:$uimm5), "$rd, $rs1, $uimm5"> {
- bits<5> uimm5;
+class RVPShiftW_ri<bits<3> f, bits<3> funct3, string opcodestr>
+ : RVPShift_ri<f, funct3, opcodestr, uimm5> {
+ bits<5> shamt;
let Inst{26-25} = 0b01;
- let Inst{24-20} = uimm5;
+ let Inst{24-20} = shamt;
}
-class RVPUnaryImm4<bits<3> f, string opcodestr>
- : RVPUnary<f, opcodestr, (ins GPR:$rs1, uimm4:$uimm4), "$rd, $rs1, $uimm4"> {
- bits<4> uimm4;
+class RVPShiftH_ri<bits<3> f, bits<3> funct3, string opcodestr>
+ : RVPShift_ri<f, funct3, opcodestr, uimm4> {
+ bits<4> shamt;
let Inst{26-24} = 0b001;
- let Inst{23-20} = uimm4;
+ let Inst{23-20} = shamt;
}
-class RVPUnaryImm3<bits<3> f, string opcodestr>
- : RVPUnary<f, opcodestr, (ins GPR:$rs1, uimm3:$uimm3), "$rd, $rs1, $uimm3"> {
- bits<3> uimm3;
+class RVPShiftB_ri<bits<3> f, bits<3> funct3, string opcodestr>
+ : RVPShift_ri<f, funct3, opcodestr, uimm3> {
+ bits<3> shamt;
let Inst{26-23} = 0b0001;
- let Inst{22-20} = uimm3;
+ let Inst{22-20} = shamt;
}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-class RVPUnaryWUF<bits<2> w, bits<5> uf, string opcodestr>
+class RVPUnary_ri<bits<2> w, bits<5> uf, string opcodestr>
: RVInstIBase<0b010, OPC_OP_IMM_32, (outs GPR:$rd), (ins GPR:$rs1),
opcodestr, "$rd, $rs1"> {
let Inst{31-27} = 0b11100;
@@ -149,16 +151,16 @@ def ABSW : UnaryW_r<0b011000000111, 0b001, "absw">;
} // Predicates = [HasStdExtP, IsRV64]
let Predicates = [HasStdExtP] in {
-def PSLLI_B : RVPUnaryImm3<0b000, "pslli.b">;
-def PSLLI_H : RVPUnaryImm4<0b000, "pslli.h">;
-def PSSLAI_H : RVPUnaryImm4<0b101, "psslai.h">;
+def PSLLI_B : RVPShiftB_ri<0b000, 0b010, "pslli.b">;
+def PSLLI_H : RVPShiftH_ri<0b000, 0b010, "pslli.h">;
+def PSSLAI_H : RVPShiftH_ri<0b101, 0b010, "psslai.h">;
} // Predicates = [HasStdExtP]
let DecoderNamespace = "RV32Only",
Predicates = [HasStdExtP, IsRV32] in
-def SSLAI : RVPUnaryImm5<0b101, "sslai">;
+def SSLAI : RVPShiftW_ri<0b101, 0b010, "sslai">;
let Predicates = [HasStdExtP, IsRV64] in {
-def PSLLI_W : RVPUnaryImm5<0b000, "pslli.w">;
-def PSSLAI_W : RVPUnaryImm5<0b101, "psslai.w">;
+def PSLLI_W : RVPShiftW_ri<0b000, 0b010, "pslli.w">;
+def PSSLAI_W : RVPShiftW_ri<0b101, 0b010, "psslai.w">;
} // Predicates = [HasStdExtP, IsRV64]
let Predicates = [HasStdExtP] in
@@ -169,13 +171,13 @@ let Predicates = [HasStdExtP] in
def PLI_B : PLI_B_i<0b10110100, "pli.b">;
let Predicates = [HasStdExtP] in {
-def PSEXT_H_B : RVPUnaryWUF<0b00, 0b00100, "psext.h.b">;
-def PSABS_H : RVPUnaryWUF<0b00, 0b00111, "psabs.h">;
-def PSABS_B : RVPUnaryWUF<0b10, 0b00111, "psabs.b">;
+def PSEXT_H_B : RVPUnary_ri<0b00, 0b00100, "psext.h.b">;
+def PSABS_H : RVPUnary_ri<0b00, 0b00111, "psabs.h">;
+def PSABS_B : RVPUnary_ri<0b10, 0b00111, "psabs.b">;
} // Predicates = [HasStdExtP]
let Predicates = [HasStdExtP, IsRV64] in {
-def PSEXT_W_B : RVPUnaryWUF<0b01, 0b00100, "psext.w.b">;
-def PSEXT_W_H : RVPUnaryWUF<0b01, 0b00101, "psext.w.h">;
+def PSEXT_W_B : RVPUnary_ri<0b01, 0b00100, "psext.w.b">;
+def PSEXT_W_H : RVPUnary_ri<0b01, 0b00101, "psext.w.h">;
} // Predicates = [HasStdExtP, IsRV64]
let Predicates = [HasStdExtP] in
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index dfa532a..6afc942d 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -788,7 +788,7 @@ class VPseudoUSLoadNoMask<VReg RetClass,
DAGOperand sewop = sew> :
RISCVVPseudo<(outs RetClass:$rd),
(ins RetClass:$dest, GPRMemZeroOffset:$rs1, AVL:$vl,
- sewop:$sew, vec_policy:$policy), []>,
+ sewop:$sew, vec_policy:$policy)>,
RISCVVLE</*Masked*/0, /*Strided*/0, /*FF*/0, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -804,7 +804,7 @@ class VPseudoUSLoadMask<VReg RetClass,
RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
(ins GetVRegNoV0<RetClass>.R:$passthru,
GPRMemZeroOffset:$rs1, VMaskOp:$vm, AVL:$vl, sew:$sew,
- vec_policy:$policy), []>,
+ vec_policy:$policy)>,
RISCVVLE</*Masked*/1, /*Strided*/0, /*FF*/0, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -821,7 +821,7 @@ class VPseudoUSLoadFFNoMask<VReg RetClass,
int EEW> :
RISCVVPseudo<(outs RetClass:$rd, GPR:$vl),
(ins RetClass:$dest, GPRMemZeroOffset:$rs1, AVL:$avl,
- sew:$sew, vec_policy:$policy), []>,
+ sew:$sew, vec_policy:$policy)>,
RISCVVLE</*Masked*/0, /*Strided*/0, /*FF*/1, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -837,7 +837,7 @@ class VPseudoUSLoadFFMask<VReg RetClass,
RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd, GPR:$vl),
(ins GetVRegNoV0<RetClass>.R:$passthru,
GPRMemZeroOffset:$rs1, VMaskOp:$vm, AVL:$avl, sew:$sew,
- vec_policy:$policy), []>,
+ vec_policy:$policy)>,
RISCVVLE</*Masked*/1, /*Strided*/0, /*FF*/1, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -854,7 +854,7 @@ class VPseudoSLoadNoMask<VReg RetClass,
int EEW> :
RISCVVPseudo<(outs RetClass:$rd),
(ins RetClass:$dest, GPRMemZeroOffset:$rs1, GPR:$rs2,
- AVL:$vl, sew:$sew, vec_policy:$policy), []>,
+ AVL:$vl, sew:$sew, vec_policy:$policy)>,
RISCVVLE</*Masked*/0, /*Strided*/1, /*FF*/0, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -870,7 +870,7 @@ class VPseudoSLoadMask<VReg RetClass,
RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
(ins GetVRegNoV0<RetClass>.R:$passthru,
GPRMemZeroOffset:$rs1, GPR:$rs2, VMaskOp:$vm, AVL:$vl,
- sew:$sew, vec_policy:$policy), []>,
+ sew:$sew, vec_policy:$policy)>,
RISCVVLE</*Masked*/1, /*Strided*/1, /*FF*/0, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -892,7 +892,7 @@ class VPseudoILoadNoMask<VReg RetClass,
bits<2> TargetConstraintType = 1> :
RISCVVPseudo<(outs RetClass:$rd),
(ins RetClass:$dest, GPRMemZeroOffset:$rs1, IdxClass:$rs2,
- AVL:$vl, sew:$sew, vec_policy:$policy), []>,
+ AVL:$vl, sew:$sew, vec_policy:$policy)>,
RISCVVLX</*Masked*/0, Ordered, !logtwo(EEW), VLMul, LMUL> {
let mayLoad = 1;
let mayStore = 0;
@@ -914,7 +914,7 @@ class VPseudoILoadMask<VReg RetClass,
RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
(ins GetVRegNoV0<RetClass>.R:$passthru,
GPRMemZeroOffset:$rs1, IdxClass:$rs2, VMaskOp:$vm,
- AVL:$vl, sew:$sew, vec_policy:$policy), []>,
+ AVL:$vl, sew:$sew, vec_policy:$policy)>,
RISCVVLX</*Masked*/1, Ordered, !logtwo(EEW), VLMul, LMUL> {
let mayLoad = 1;
let mayStore = 0;
@@ -933,7 +933,7 @@ class VPseudoUSStoreNoMask<VReg StClass,
DAGOperand sewop = sew> :
RISCVVPseudo<(outs),
(ins StClass:$rd, GPRMemZeroOffset:$rs1, AVL:$vl,
- sewop:$sew), []>,
+ sewop:$sew)>,
RISCVVSE</*Masked*/0, /*Strided*/0, !logtwo(EEW), VLMul> {
let mayLoad = 0;
let mayStore = 1;
@@ -946,7 +946,7 @@ class VPseudoUSStoreMask<VReg StClass,
int EEW> :
RISCVVPseudo<(outs),
(ins StClass:$rd, GPRMemZeroOffset:$rs1,
- VMaskOp:$vm, AVL:$vl, sew:$sew), []>,
+ VMaskOp:$vm, AVL:$vl, sew:$sew)>,
RISCVVSE</*Masked*/1, /*Strided*/0, !logtwo(EEW), VLMul> {
let mayLoad = 0;
let mayStore = 1;
@@ -960,7 +960,7 @@ class VPseudoSStoreNoMask<VReg StClass,
int EEW> :
RISCVVPseudo<(outs),
(ins StClass:$rd, GPRMemZeroOffset:$rs1, GPR:$rs2,
- AVL:$vl, sew:$sew), []>,
+ AVL:$vl, sew:$sew)>,
RISCVVSE</*Masked*/0, /*Strided*/1, !logtwo(EEW), VLMul> {
let mayLoad = 0;
let mayStore = 1;
@@ -973,7 +973,7 @@ class VPseudoSStoreMask<VReg StClass,
int EEW> :
RISCVVPseudo<(outs),
(ins StClass:$rd, GPRMemZeroOffset:$rs1, GPR:$rs2,
- VMaskOp:$vm, AVL:$vl, sew:$sew), []>,
+ VMaskOp:$vm, AVL:$vl, sew:$sew)>,
RISCVVSE</*Masked*/1, /*Strided*/1, !logtwo(EEW), VLMul> {
let mayLoad = 0;
let mayStore = 1;
@@ -986,7 +986,7 @@ class VPseudoSStoreMask<VReg StClass,
class VPseudoNullaryNoMask<VReg RegClass> :
RISCVVPseudo<(outs RegClass:$rd),
(ins RegClass:$passthru,
- AVL:$vl, sew:$sew, vec_policy:$policy), []> {
+ AVL:$vl, sew:$sew, vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1015,7 +1015,7 @@ class VPseudoNullaryMask<VReg RegClass> :
// Nullary for pseudo instructions. They are expanded in
// RISCVExpandPseudoInsts pass.
class VPseudoNullaryPseudoM<string BaseInst> :
- RISCVVPseudo<(outs VR:$rd), (ins AVL:$vl, sew_mask:$sew), []> {
+ RISCVVPseudo<(outs VR:$rd), (ins AVL:$vl, sew_mask:$sew)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1031,7 +1031,7 @@ class VPseudoUnaryNoMask<DAGOperand RetClass,
bits<2> TargetConstraintType = 1> :
RISCVVPseudo<(outs RetClass:$rd),
(ins RetClass:$passthru, OpClass:$rs2,
- AVL:$vl, sew:$sew, vec_policy:$policy), []> {
+ AVL:$vl, sew:$sew, vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1047,7 +1047,7 @@ class VPseudoUnaryNoMaskNoPolicy<DAGOperand RetClass,
string Constraint = "",
bits<2> TargetConstraintType = 1> :
RISCVVPseudo<(outs RetClass:$rd),
- (ins OpClass:$rs2, AVL:$vl, sew_mask:$sew), []> {
+ (ins OpClass:$rs2, AVL:$vl, sew_mask:$sew)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1063,7 +1063,7 @@ class VPseudoUnaryNoMaskRoundingMode<DAGOperand RetClass,
bits<2> TargetConstraintType = 1> :
RISCVVPseudo<(outs RetClass:$rd),
(ins RetClass:$passthru, OpClass:$rs2, vec_rm:$rm,
- AVL:$vl, sew:$sew, vec_policy:$policy), []> {
+ AVL:$vl, sew:$sew, vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1084,7 +1084,7 @@ class VPseudoUnaryMask<VReg RetClass,
DAGOperand sewop = sew> :
RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
(ins GetVRegNoV0<RetClass>.R:$passthru, OpClass:$rs2,
- VMaskOp:$vm, AVL:$vl, sewop:$sew, vec_policy:$policy), []> {
+ VMaskOp:$vm, AVL:$vl, sewop:$sew, vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1104,7 +1104,7 @@ class VPseudoUnaryMaskRoundingMode<VReg RetClass,
RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
(ins GetVRegNoV0<RetClass>.R:$passthru, OpClass:$rs2,
VMaskOp:$vm, vec_rm:$rm,
- AVL:$vl, sew:$sew, vec_policy:$policy), []> {
+ AVL:$vl, sew:$sew, vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1139,7 +1139,7 @@ class VPseudoUnaryMask_NoExcept<VReg RetClass,
class VPseudoUnaryNoMaskGPROut :
RISCVVPseudo<(outs GPR:$rd),
- (ins VR:$rs2, AVL:$vl, sew_mask:$sew), []> {
+ (ins VR:$rs2, AVL:$vl, sew_mask:$sew)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1149,7 +1149,7 @@ class VPseudoUnaryNoMaskGPROut :
class VPseudoUnaryMaskGPROut :
RISCVVPseudo<(outs GPR:$rd),
- (ins VR:$rs1, VMaskOp:$vm, AVL:$vl, sew_mask:$sew), []> {
+ (ins VR:$rs1, VMaskOp:$vm, AVL:$vl, sew_mask:$sew)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1163,7 +1163,7 @@ class VPseudoUnaryAnyMask<VReg RetClass,
VReg Op1Class> :
RISCVVPseudo<(outs RetClass:$rd),
(ins RetClass:$passthru, Op1Class:$rs2,
- VR:$vm, AVL:$vl, sew:$sew), []> {
+ VR:$vm, AVL:$vl, sew:$sew)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1197,7 +1197,7 @@ class VPseudoBinaryNoMaskPolicy<VReg RetClass,
bits<2> TargetConstraintType = 1> :
RISCVVPseudo<(outs RetClass:$rd),
(ins RetClass:$passthru, Op1Class:$rs2, Op2Class:$rs1,
- AVL:$vl, sew:$sew, vec_policy:$policy), []> {
+ AVL:$vl, sew:$sew, vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1241,7 +1241,7 @@ class VPseudoBinaryMaskPolicyRoundingMode<VReg RetClass,
(ins GetVRegNoV0<RetClass>.R:$passthru,
Op1Class:$rs2, Op2Class:$rs1,
VMaskOp:$vm, vec_rm:$rm, AVL:$vl,
- sew:$sew, vec_policy:$policy), []> {
+ sew:$sew, vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1266,7 +1266,7 @@ class VPseudoTiedBinaryNoMask<VReg RetClass,
bits<2> TargetConstraintType = 1> :
RISCVVPseudo<(outs RetClass:$rd),
(ins RetClass:$rs2, Op2Class:$rs1, AVL:$vl, sew:$sew,
- vec_policy:$policy), []> {
+ vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1288,7 +1288,7 @@ class VPseudoTiedBinaryNoMaskRoundingMode<VReg RetClass,
(ins RetClass:$rs2, Op2Class:$rs1,
vec_rm:$rm,
AVL:$vl, sew:$sew,
- vec_policy:$policy), []> {
+ vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1380,7 +1380,7 @@ class VPseudoTernaryMaskPolicyRoundingMode<VReg RetClass,
Op1Class:$rs2, Op2Class:$rs1,
VMaskOp:$vm,
vec_rm:$rm,
- AVL:$vl, sew:$sew, vec_policy:$policy), []> {
+ AVL:$vl, sew:$sew, vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1451,7 +1451,7 @@ class VPseudoTiedBinaryMaskRoundingMode<VReg RetClass,
Op2Class:$rs1,
VMaskOp:$vm,
vec_rm:$rm,
- AVL:$vl, sew:$sew, vec_policy:$policy), []> {
+ AVL:$vl, sew:$sew, vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1480,7 +1480,7 @@ class VPseudoBinaryCarry<VReg RetClass,
(ins Op1Class:$rs2, Op2Class:$rs1,
VMV0:$carry, AVL:$vl, sew:$sew),
(ins Op1Class:$rs2, Op2Class:$rs1,
- AVL:$vl, sew:$sew)), []> {
+ AVL:$vl, sew:$sew))> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1498,7 +1498,7 @@ class VPseudoTiedBinaryCarryIn<VReg RetClass,
bits<2> TargetConstraintType = 1> :
RISCVVPseudo<(outs RetClass:$rd),
(ins RetClass:$passthru, Op1Class:$rs2, Op2Class:$rs1,
- VMV0:$carry, AVL:$vl, sew:$sew), []> {
+ VMV0:$carry, AVL:$vl, sew:$sew)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1516,7 +1516,7 @@ class VPseudoTernaryNoMask<VReg RetClass,
string Constraint> :
RISCVVPseudo<(outs RetClass:$rd),
(ins RetClass:$rs3, Op1Class:$rs1, Op2Class:$rs2,
- AVL:$vl, sew:$sew), []> {
+ AVL:$vl, sew:$sew)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1532,7 +1532,7 @@ class VPseudoTernaryNoMaskWithPolicy<VReg RetClass,
bits<2> TargetConstraintType = 1> :
RISCVVPseudo<(outs RetClass:$rd),
(ins RetClass:$rs3, Op1Class:$rs1, Op2Class:$rs2,
- AVL:$vl, sew:$sew, vec_policy:$policy), []> {
+ AVL:$vl, sew:$sew, vec_policy:$policy)> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1570,7 +1570,7 @@ class VPseudoUSSegLoadNoMask<VReg RetClass,
bits<4> NF> :
RISCVVPseudo<(outs RetClass:$rd),
(ins RetClass:$dest, GPRMemZeroOffset:$rs1, AVL:$vl,
- sew:$sew, vec_policy:$policy), []>,
+ sew:$sew, vec_policy:$policy)>,
RISCVVLSEG<NF, /*Masked*/0, /*Strided*/0, /*FF*/0, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -1587,7 +1587,7 @@ class VPseudoUSSegLoadMask<VReg RetClass,
RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
(ins GetVRegNoV0<RetClass>.R:$passthru,
GPRMemZeroOffset:$rs1, VMaskOp:$vm, AVL:$vl, sew:$sew,
- vec_policy:$policy), []>,
+ vec_policy:$policy)>,
RISCVVLSEG<NF, /*Masked*/1, /*Strided*/0, /*FF*/0, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -1605,7 +1605,7 @@ class VPseudoUSSegLoadFFNoMask<VReg RetClass,
bits<4> NF> :
RISCVVPseudo<(outs RetClass:$rd, GPR:$vl),
(ins RetClass:$dest, GPRMemZeroOffset:$rs1, AVL:$avl,
- sew:$sew, vec_policy:$policy), []>,
+ sew:$sew, vec_policy:$policy)>,
RISCVVLSEG<NF, /*Masked*/0, /*Strided*/0, /*FF*/1, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -1622,7 +1622,7 @@ class VPseudoUSSegLoadFFMask<VReg RetClass,
RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd, GPR:$vl),
(ins GetVRegNoV0<RetClass>.R:$passthru,
GPRMemZeroOffset:$rs1, VMaskOp:$vm, AVL:$avl, sew:$sew,
- vec_policy:$policy), []>,
+ vec_policy:$policy)>,
RISCVVLSEG<NF, /*Masked*/1, /*Strided*/0, /*FF*/1, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -1640,7 +1640,7 @@ class VPseudoSSegLoadNoMask<VReg RetClass,
bits<4> NF> :
RISCVVPseudo<(outs RetClass:$rd),
(ins RetClass:$passthru, GPRMemZeroOffset:$rs1, GPR:$offset,
- AVL:$vl, sew:$sew, vec_policy:$policy), []>,
+ AVL:$vl, sew:$sew, vec_policy:$policy)>,
RISCVVLSEG<NF, /*Masked*/0, /*Strided*/1, /*FF*/0, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -1657,7 +1657,7 @@ class VPseudoSSegLoadMask<VReg RetClass,
RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
(ins GetVRegNoV0<RetClass>.R:$passthru,
GPRMemZeroOffset:$rs1, GPR:$offset, VMaskOp:$vm,
- AVL:$vl, sew:$sew, vec_policy:$policy), []>,
+ AVL:$vl, sew:$sew, vec_policy:$policy)>,
RISCVVLSEG<NF, /*Masked*/1, /*Strided*/1, /*FF*/0, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -1679,7 +1679,7 @@ class VPseudoISegLoadNoMask<VReg RetClass,
RISCVVPseudo<(outs RetClass:$rd),
(ins RetClass:$passthru, GPRMemZeroOffset:$rs1,
IdxClass:$offset, AVL:$vl, sew:$sew,
- vec_policy:$policy), []>,
+ vec_policy:$policy)>,
RISCVVLXSEG<NF, /*Masked*/0, Ordered, !logtwo(EEW), VLMul, LMUL> {
let mayLoad = 1;
let mayStore = 0;
@@ -1701,7 +1701,7 @@ class VPseudoISegLoadMask<VReg RetClass,
RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
(ins GetVRegNoV0<RetClass>.R:$passthru,
GPRMemZeroOffset:$rs1, IdxClass:$offset, VMaskOp:$vm,
- AVL:$vl, sew:$sew, vec_policy:$policy), []>,
+ AVL:$vl, sew:$sew, vec_policy:$policy)>,
RISCVVLXSEG<NF, /*Masked*/1, Ordered, !logtwo(EEW), VLMul, LMUL> {
let mayLoad = 1;
let mayStore = 0;
@@ -1735,7 +1735,7 @@ class VPseudoUSSegStoreMask<VReg ValClass,
bits<4> NF> :
RISCVVPseudo<(outs),
(ins ValClass:$rd, GPRMemZeroOffset:$rs1,
- VMaskOp:$vm, AVL:$vl, sew:$sew), []>,
+ VMaskOp:$vm, AVL:$vl, sew:$sew)>,
RISCVVSSEG<NF, /*Masked*/1, /*Strided*/0, !logtwo(EEW), VLMul> {
let mayLoad = 0;
let mayStore = 1;
@@ -1750,7 +1750,7 @@ class VPseudoSSegStoreNoMask<VReg ValClass,
bits<4> NF> :
RISCVVPseudo<(outs),
(ins ValClass:$rd, GPRMemZeroOffset:$rs1, GPR:$offset,
- AVL:$vl, sew:$sew), []>,
+ AVL:$vl, sew:$sew)>,
RISCVVSSEG<NF, /*Masked*/0, /*Strided*/1, !logtwo(EEW), VLMul> {
let mayLoad = 0;
let mayStore = 1;
@@ -1764,7 +1764,7 @@ class VPseudoSSegStoreMask<VReg ValClass,
bits<4> NF> :
RISCVVPseudo<(outs),
(ins ValClass:$rd, GPRMemZeroOffset:$rs1, GPR: $offset,
- VMaskOp:$vm, AVL:$vl, sew:$sew), []>,
+ VMaskOp:$vm, AVL:$vl, sew:$sew)>,
RISCVVSSEG<NF, /*Masked*/1, /*Strided*/1, !logtwo(EEW), VLMul> {
let mayLoad = 0;
let mayStore = 1;
@@ -1782,7 +1782,7 @@ class VPseudoISegStoreNoMask<VReg ValClass,
bit Ordered> :
RISCVVPseudo<(outs),
(ins ValClass:$rd, GPRMemZeroOffset:$rs1, IdxClass: $index,
- AVL:$vl, sew:$sew), []>,
+ AVL:$vl, sew:$sew)>,
RISCVVSXSEG<NF, /*Masked*/0, Ordered, !logtwo(EEW), VLMul, LMUL> {
let mayLoad = 0;
let mayStore = 1;
@@ -1799,7 +1799,7 @@ class VPseudoISegStoreMask<VReg ValClass,
bit Ordered> :
RISCVVPseudo<(outs),
(ins ValClass:$rd, GPRMemZeroOffset:$rs1, IdxClass: $index,
- VMaskOp:$vm, AVL:$vl, sew:$sew), []>,
+ VMaskOp:$vm, AVL:$vl, sew:$sew)>,
RISCVVSXSEG<NF, /*Masked*/1, Ordered, !logtwo(EEW), VLMul, LMUL> {
let mayLoad = 0;
let mayStore = 1;
@@ -6703,7 +6703,7 @@ let Predicates = [HasVInstructions] in {
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
let HasSEWOp = 1, BaseInstr = VMV_X_S in
def PseudoVMV_X_S:
- RISCVVPseudo<(outs GPR:$rd), (ins VR:$rs2, sew:$sew), []>,
+ RISCVVPseudo<(outs GPR:$rd), (ins VR:$rs2, sew:$sew)>,
Sched<[WriteVMovXS, ReadVMovXS]>;
let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VMV_S_X, isReMaterializable = 1,
Constraints = "$rd = $passthru" in
@@ -6723,8 +6723,7 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
foreach f = FPList in {
let HasSEWOp = 1, BaseInstr = VFMV_F_S in
def "PseudoVFMV_" # f.FX # "_S" :
- RISCVVPseudo<(outs f.fprclass:$rd),
- (ins VR:$rs2, sew:$sew), []>,
+ RISCVVPseudo<(outs f.fprclass:$rd), (ins VR:$rs2, sew:$sew)>,
Sched<[WriteVMovFS, ReadVMovFS]>;
let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VFMV_S_F, isReMaterializable = 1,
Constraints = "$rd = $passthru" in
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
index 1bb67f4..c75addd9 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
@@ -11,6 +11,20 @@
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
+// RISC-V specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDT_NDS_FMV_BF16_X
+ : SDTypeProfile<1, 1, [SDTCisVT<0, bf16>, SDTCisVT<1, XLenVT>]>;
+def SDT_NDS_FMV_X_ANYEXTBF16
+ : SDTypeProfile<1, 1, [SDTCisVT<0, XLenVT>, SDTCisVT<1, bf16>]>;
+
+def riscv_nds_fmv_bf16_x
+ : SDNode<"RISCVISD::NDS_FMV_BF16_X", SDT_NDS_FMV_BF16_X>;
+def riscv_nds_fmv_x_anyextbf16
+ : SDNode<"RISCVISD::NDS_FMV_X_ANYEXTBF16", SDT_NDS_FMV_X_ANYEXTBF16>;
+
+//===----------------------------------------------------------------------===//
// Operand and SDNode transformation definitions.
//===----------------------------------------------------------------------===//
@@ -773,6 +787,25 @@ def : Pat<(bf16 (fpround FPR32:$rs)),
(NDS_FCVT_BF16_S FPR32:$rs)>;
} // Predicates = [HasVendorXAndesBFHCvt]
+let isCodeGenOnly = 1 in {
+def NDS_FMV_BF16_X : FPUnaryOp_r<0b1111000, 0b00000, 0b000, FPR16, GPR, "fmv.w.x">,
+ Sched<[WriteFMovI32ToF32, ReadFMovI32ToF32]>;
+def NDS_FMV_X_BF16 : FPUnaryOp_r<0b1110000, 0b00000, 0b000, GPR, FPR16, "fmv.x.w">,
+ Sched<[WriteFMovF32ToI32, ReadFMovF32ToI32]>;
+}
+
+let Predicates = [HasVendorXAndesBFHCvt] in {
+def : Pat<(riscv_nds_fmv_bf16_x GPR:$src), (NDS_FMV_BF16_X GPR:$src)>;
+def : Pat<(riscv_nds_fmv_x_anyextbf16 (bf16 FPR16:$src)),
+ (NDS_FMV_X_BF16 (bf16 FPR16:$src))>;
+} // Predicates = [HasVendorXAndesBFHCvt]
+
+// Use flh/fsh to load/store bf16 if zfh is enabled.
+let Predicates = [HasStdExtZfh, HasVendorXAndesBFHCvt] in {
+def : LdPat<load, FLH, bf16>;
+def : StPat<store, FSH, FPR16, bf16>;
+} // Predicates = [HasStdExtZfh, HasVendorXAndesBFHCvt]
+
let Predicates = [HasVendorXAndesVBFHCvt] in {
defm PseudoNDS_VFWCVT_S_BF16 : VPseudoVWCVT_S_BF16;
defm PseudoNDS_VFNCVT_BF16_S : VPseudoVNCVT_BF16_S;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td
index f173440..ed1a60a 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td
@@ -291,31 +291,31 @@ def : CompressPat<(MUL GPRC:$rs1, GPRC:$rs2, GPRC:$rs1),
let Predicates = [HasStdExtZcb, HasStdExtZbb] in{
def : CompressPat<(SEXT_B GPRC:$rs1, GPRC:$rs1),
- (C_SEXT_B GPRC:$rs1, GPRC:$rs1)>;
+ (C_SEXT_B GPRC:$rs1)>;
def : CompressPat<(SEXT_H GPRC:$rs1, GPRC:$rs1),
- (C_SEXT_H GPRC:$rs1, GPRC:$rs1)>;
+ (C_SEXT_H GPRC:$rs1)>;
} // Predicates = [HasStdExtZcb, HasStdExtZbb]
let Predicates = [HasStdExtZcb, HasStdExtZbb] in{
def : CompressPat<(ZEXT_H_RV32 GPRC:$rs1, GPRC:$rs1),
- (C_ZEXT_H GPRC:$rs1, GPRC:$rs1)>;
+ (C_ZEXT_H GPRC:$rs1)>;
def : CompressPat<(ZEXT_H_RV64 GPRC:$rs1, GPRC:$rs1),
- (C_ZEXT_H GPRC:$rs1, GPRC:$rs1)>;
+ (C_ZEXT_H GPRC:$rs1)>;
} // Predicates = [HasStdExtZcb, HasStdExtZbb]
let Predicates = [HasStdExtZcb] in{
def : CompressPat<(ANDI GPRC:$rs1, GPRC:$rs1, 255),
- (C_ZEXT_B GPRC:$rs1, GPRC:$rs1)>;
+ (C_ZEXT_B GPRC:$rs1)>;
} // Predicates = [HasStdExtZcb]
let Predicates = [HasStdExtZcb, HasStdExtZba, IsRV64] in{
def : CompressPat<(ADD_UW GPRC:$rs1, GPRC:$rs1, X0),
- (C_ZEXT_W GPRC:$rs1, GPRC:$rs1)>;
+ (C_ZEXT_W GPRC:$rs1)>;
} // Predicates = [HasStdExtZcb, HasStdExtZba, IsRV64]
let Predicates = [HasStdExtZcb] in{
def : CompressPat<(XORI GPRC:$rs1, GPRC:$rs1, -1),
- (C_NOT GPRC:$rs1, GPRC:$rs1)>;
+ (C_NOT GPRC:$rs1)>;
}
let Predicates = [HasStdExtZcb] in{
diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
index 0565fcd..30d8f85 100644
--- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
@@ -224,10 +224,10 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
Value *Offset = ConstantInt::get(XLenTy, Indices[0] * ScalarSizeInBytes);
Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset);
- // Note: Same VL as above, but i32 not xlen due to signature of
- // vp.strided.load
- VL = Builder.CreateElementCount(Builder.getInt32Ty(),
- VTy->getElementCount());
+ // For rv64, we need to truncate the i64 VL to i32 to match the intrinsic
+ // signature. VL is at most the number of active lanes, which fits in i32.
+ VL = Builder.CreateTrunc(VL, Builder.getInt32Ty());
+
CallInst *CI =
Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load,
{VTy, BasePtr->getType(), Stride->getType()},
@@ -302,10 +302,9 @@ bool RISCVTargetLowering::lowerInterleavedStore(Instruction *Store,
Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
Value *Offset = ConstantInt::get(XLenTy, Index * ScalarSizeInBytes);
Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset);
- // Note: Same VL as above, but i32 not xlen due to signature of
- // vp.strided.store
- VL = Builder.CreateElementCount(Builder.getInt32Ty(),
- VTy->getElementCount());
+ // For rv64, we need to truncate the i64 VL to i32 to match the intrinsic
+ // signature. VL is at most the number of active lanes, which fits in i32.
+ VL = Builder.CreateTrunc(VL, Builder.getInt32Ty());
CallInst *CI =
Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_store,
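The safety argument in both comments above can be stated as a tiny standalone check. A minimal sketch under that stated assumption (the function name is illustrative, not an LLVM API): truncating the XLen-sized VL to i32 is lossless because VL never exceeds the i32-representable element count.

#include <cassert>
#include <cstdint>

// Illustrative only: on rv64 the VL value is carried as i64, but the
// vp.strided.load/store intrinsics take an i32 EVL operand.
uint32_t truncateVL(uint64_t VL, uint64_t NumElements) {
  assert(VL <= NumElements && NumElements <= UINT32_MAX &&
         "VL is bounded by the element count, which fits in i32");
  return static_cast<uint32_t>(VL);
}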
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
index e87f452..ccb39e8 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
@@ -268,6 +268,11 @@ def GPRNoX0 : GPRRegisterClass<(sub GPR, X0)> {
let DiagnosticString = "register must be a GPR excluding zero (x0)";
}
+def GPRNoX2 : GPRRegisterClass<(sub GPR, X2)> {
+ let DiagnosticType = "InvalidRegClassGPRNoX2";
+ let DiagnosticString = "register must be a GPR excluding sp (x2)";
+}
+
def GPRNoX0X2 : GPRRegisterClass<(sub GPR, X0, X2)> {
let DiagnosticType = "InvalidRegClassGPRNoX0X2";
let DiagnosticString = "register must be a GPR excluding zero (x0) and sp (x2)";
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td
index 3e286a7..bf23812 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td
@@ -24,6 +24,67 @@ class SMX60IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit isF = 0
bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW));
}
+defvar SMX60VLEN = 256;
+defvar SMX60DLEN = !div(SMX60VLEN, 2);
+
+class Get1248Latency<string mx> {
+ int c = !cond(
+ !eq(mx, "M2") : 2,
+ !eq(mx, "M4") : 4,
+ !eq(mx, "M8") : 8,
+ true: 1
+ );
+}
+
+// Used for: logical ops, shifts, sign ext, merge/move, FP sign/recip/convert, mask ops, slides
+class Get4816Latency<string mx> {
+ int c = !cond(
+ !eq(mx, "M4") : 8,
+ !eq(mx, "M8") : 16,
+ true: 4
+ );
+}
+
+// Used for: arithmetic (add/sub/min/max), saturating/averaging, FP add/sub/min/max
+class Get458Latency<string mx> {
+ int c = !cond(
+ !eq(mx, "M4") : 5,
+ !eq(mx, "M8") : 8,
+ true: 4
+ );
+}
+
+// Widening scaling pattern (4/4/4/4/5/8/8): plateaus at higher LMULs
+// Used for: widening operations
+class Get4588Latency<string mx> {
+ int c = !cond(
+ !eq(mx, "M2") : 5,
+ !eq(mx, "M4") : 8,
+ !eq(mx, "M8") : 8, // M8 not supported for most widening, fallback
+ true: 4
+ );
+}
+
+// Used for: mask-producing comparisons, carry ops with mask, FP comparisons
+class Get461018Latency<string mx> {
+ int c = !cond(
+ !eq(mx, "M2") : 6,
+ !eq(mx, "M4") : 10,
+ !eq(mx, "M8") : 18,
+ true: 4
+ );
+}
+
+// Used for: e64 multiply pattern, complex ops
+class Get781632Latency<string mx> {
+ int c = !cond(
+ !eq(mx, "M2") : 8,
+ !eq(mx, "M4") : 16,
+ !eq(mx, "M8") : 32,
+ true: 7
+ );
+}
+
def SpacemitX60Model : SchedMachineModel {
let IssueWidth = 2; // dual-issue
let MicroOpBufferSize = 0; // in-order
@@ -322,58 +383,96 @@ foreach LMul = [1, 2, 4, 8] in {
foreach mx = SchedMxList in {
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
- defm "" : LMULWriteResMX<"WriteVIALUV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIALUX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIALUI", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVExtV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICALUV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICALUX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICALUI", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICALUMV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICALUMX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICALUMI", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICmpV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICmpX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICmpI", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMergeV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMergeX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMergeI", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMovV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMovX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMovI", [SMX60_VIEU], mx, IsWorstCase>;
-
- defm "" : LMULWriteResMX<"WriteVShiftV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVShiftX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVShiftI", [SMX60_VIEU], mx, IsWorstCase>;
-
- defm "" : LMULWriteResMX<"WriteVIMulV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMulX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMulAddV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMulAddX", [SMX60_VIEU], mx, IsWorstCase>;
+ let Latency = Get458Latency<mx>.c, ReleaseAtCycles = [4] in {
+ defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SMX60_VIEU], mx, IsWorstCase>;
+ }
+
+ let Latency = Get4816Latency<mx>.c, ReleaseAtCycles = [4] in {
+ // Pattern of vadd, vsub, vrsub: 4/4/5/8
+ // Pattern of vand, vor, vxor: 4/4/8/16
+ // They are grouped together, so we use the worst case 4/4/8/16.
+ // TODO: use InstRW to override individual instructions' scheduling data
+ defm "" : LMULWriteResMX<"WriteVIALUV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIALUX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIALUI", [SMX60_VIEU], mx, IsWorstCase>;
+
+ defm "" : LMULWriteResMX<"WriteVExtV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMergeV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMergeX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMergeI", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMovV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMovX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMovI", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVShiftV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVShiftX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVShiftI", [SMX60_VIEU], mx, IsWorstCase>;
+
+ defm "" : LMULWriteResMX<"WriteVICALUV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUI", [SMX60_VIEU], mx, IsWorstCase>;
+ }
+
+ let Latency = Get461018Latency<mx>.c, ReleaseAtCycles = [4] in {
+ defm "" : LMULWriteResMX<"WriteVICALUMV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUMX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUMI", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICmpV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICmpX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICmpI", [SMX60_VIEU], mx, IsWorstCase>;
+ }
+
+ // Pattern of vmacc, vmadd, vmul, vmulh, etc.: e8/e16 = 4/4/5/8, e32 = 5/5/5/8,
+ // e64 = 7/8/16/32. We use the worst case until these writes are split by SEW.
+ // TODO: change WriteVIMulV, etc to be defined with LMULSEWSchedWrites
+ let Latency = Get781632Latency<mx>.c, ReleaseAtCycles = [7] in {
+ defm "" : LMULWriteResMX<"WriteVIMulV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMulX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMulAddV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMulAddX", [SMX60_VIEU], mx, IsWorstCase>;
+ }
}
// Widening
+// Pattern of vwmul, vwmacc, etc.: e8/e16 = 4/4/5/8, e32 = 5/5/5/8.
+// We use the worst case for all of them.
foreach mx = SchedMxListW in {
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c;
- defm "" : LMULWriteResMX<"WriteVIWALUV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIWALUX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIWALUI", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIWMulV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIWMulX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIWMulAddV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIWMulAddX", [SMX60_VIEU], mx, IsWorstCase>;
+ let Latency = Get4588Latency<mx>.c, ReleaseAtCycles = [4] in {
+ defm "" : LMULWriteResMX<"WriteVIWALUV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWALUX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWALUI", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWMulV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWMulX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWMulAddV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWMulAddX", [SMX60_VIEU], mx, IsWorstCase>;
+ }
}
-// Vector Integer Division and Remainder
+// Division and remainder operations
+// Pattern of vdivu: 11/11/11/20/40/80/160
+// Pattern of vdiv: 12/12/12/22/44/88/176
+// Pattern of vremu: 12/12/12/22/44/88/176
+// Pattern of vrem: 13/13/13/24/48/96/192
+// We use 12/12/12/24/48/96/192 for all of them.
+// TODO: Create separate WriteVIRem to more closely match the latencies
foreach mx = SchedMxList in {
foreach sew = SchedSEWSet<mx>.val in {
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
- defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SMX60_VIEU], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SMX60_VIEU], mx, sew, IsWorstCase>;
+ // Slightly reduced for fractional LMULs
+ defvar Multiplier = !cond(
+ !eq(mx, "MF8") : 12,
+ !eq(mx, "MF4") : 12,
+ !eq(mx, "MF2") : 12,
+ true: 24
+ );
+
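+    // Together with the multiplier above, this is intended to produce the
+    // 12/12/12/24/48/96/192 pattern noted earlier.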
+ let Latency = !mul(Get1248Latency<mx>.c, Multiplier), ReleaseAtCycles = [12] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SMX60_VIEU], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SMX60_VIEU], mx, sew, IsWorstCase>;
+ }
}
}
@@ -381,12 +480,21 @@ foreach mx = SchedMxList in {
foreach mx = SchedMxListW in {
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c;
- defm "" : LMULWriteResMX<"WriteVNShiftV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVNShiftX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVNShiftI", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVNClipV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVNClipX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVNClipI", [SMX60_VIEU], mx, IsWorstCase>;
+ // Slightly increased for integer LMULs
+ defvar Multiplier = !cond(
+ !eq(mx, "M2") : 2,
+ !eq(mx, "M4") : 2,
+ true: 1
+ );
+
+ let Latency = !mul(Get4816Latency<mx>.c, Multiplier), ReleaseAtCycles = [4] in {
+ defm "" : LMULWriteResMX<"WriteVNShiftV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVNShiftX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVNShiftI", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVNClipV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVNClipX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVNClipI", [SMX60_VIEU], mx, IsWorstCase>;
+ }
}
// 12. Vector Fixed-Point Arithmetic Instructions
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index b43b915..da6ac2f 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -104,11 +104,6 @@ static cl::opt<bool> EnablePostMISchedLoadStoreClustering(
cl::desc("Enable PostRA load and store clustering in the machine scheduler"),
cl::init(true));
-static cl::opt<bool>
- EnableVLOptimizer("riscv-enable-vl-optimizer",
- cl::desc("Enable the RISC-V VL Optimizer pass"),
- cl::init(true), cl::Hidden);
-
static cl::opt<bool> DisableVectorMaskMutation(
"riscv-disable-vector-mask-mutation",
cl::desc("Disable the vector mask scheduling mutation"), cl::init(false),
@@ -617,8 +612,7 @@ void RISCVPassConfig::addPreRegAlloc() {
addPass(createRISCVPreRAExpandPseudoPass());
if (TM->getOptLevel() != CodeGenOptLevel::None) {
addPass(createRISCVMergeBaseOffsetOptPass());
- if (EnableVLOptimizer)
- addPass(createRISCVVLOptimizerPass());
+ addPass(createRISCVVLOptimizerPass());
}
addPass(createRISCVInsertReadWriteCSRPass());
diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
index b53d919..c946451 100644
--- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
@@ -114,14 +114,6 @@ FunctionPass *llvm::createRISCVVLOptimizerPass() {
return new RISCVVLOptimizer();
}
-/// Return true if R is a physical or virtual vector register, false otherwise.
-static bool isVectorRegClass(Register R, const MachineRegisterInfo *MRI) {
- if (R.isPhysical())
- return RISCV::VRRegClass.contains(R);
- const TargetRegisterClass *RC = MRI->getRegClass(R);
- return RISCVRI::isVRegClass(RC->TSFlags);
-}
-
LLVM_ATTRIBUTE_UNUSED
static raw_ostream &operator<<(raw_ostream &OS, const OperandInfo &OI) {
OI.print(OS);
@@ -183,37 +175,28 @@ static unsigned getIntegerExtensionOperandEEW(unsigned Factor,
return Log2EEW;
}
-/// Check whether MO is a mask operand of MI.
-static bool isMaskOperand(const MachineInstr &MI, const MachineOperand &MO,
- const MachineRegisterInfo *MRI) {
-
- if (!MO.isReg() || !isVectorRegClass(MO.getReg(), MRI))
- return false;
-
- const MCInstrDesc &Desc = MI.getDesc();
- return Desc.operands()[MO.getOperandNo()].RegClass == RISCV::VMV0RegClassID;
-}
-
static std::optional<unsigned>
getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) {
const MachineInstr &MI = *MO.getParent();
+ const MCInstrDesc &Desc = MI.getDesc();
const RISCVVPseudosTable::PseudoInfo *RVV =
RISCVVPseudosTable::getPseudoInfo(MI.getOpcode());
assert(RVV && "Could not find MI in PseudoTable");
// MI has a SEW associated with it. The RVV specification defines
// the EEW of each operand and definition in relation to MI.SEW.
- unsigned MILog2SEW =
- MI.getOperand(RISCVII::getSEWOpNum(MI.getDesc())).getImm();
+ unsigned MILog2SEW = MI.getOperand(RISCVII::getSEWOpNum(Desc)).getImm();
- const bool HasPassthru = RISCVII::isFirstDefTiedToFirstUse(MI.getDesc());
- const bool IsTied = RISCVII::isTiedPseudo(MI.getDesc().TSFlags);
+ const bool HasPassthru = RISCVII::isFirstDefTiedToFirstUse(Desc);
+ const bool IsTied = RISCVII::isTiedPseudo(Desc.TSFlags);
bool IsMODef = MO.getOperandNo() == 0 ||
(HasPassthru && MO.getOperandNo() == MI.getNumExplicitDefs());
// All mask operands have EEW=1
- if (isMaskOperand(MI, MO, MRI))
+ const MCOperandInfo &Info = Desc.operands()[MO.getOperandNo()];
+ if (Info.OperandType == MCOI::OPERAND_REGISTER &&
+ Info.RegClass == RISCV::VMV0RegClassID)
return 0;
// switch against BaseInstr to reduce number of cases that need to be
@@ -1296,8 +1279,8 @@ bool RISCVVLOptimizer::isCandidate(const MachineInstr &MI) const {
TII->get(RISCV::getRVVMCOpcode(MI.getOpcode())).TSFlags) &&
"Instruction shouldn't be supported if elements depend on VL");
- assert(MI.getOperand(0).isReg() &&
- isVectorRegClass(MI.getOperand(0).getReg(), MRI) &&
+ assert(RISCVRI::isVRegClass(
+ MRI->getRegClass(MI.getOperand(0).getReg())->TSFlags) &&
"All supported instructions produce a vector register result");
LLVM_DEBUG(dbgs() << "Found a candidate for VL reduction: " << MI << "\n");
diff --git a/llvm/lib/Target/SPIRV/SPIRVAPI.cpp b/llvm/lib/Target/SPIRV/SPIRVAPI.cpp
index bbf1d87..cfe7ef4 100644
--- a/llvm/lib/Target/SPIRV/SPIRVAPI.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVAPI.cpp
@@ -116,8 +116,8 @@ SPIRVTranslate(Module *M, std::string &SpirvObj, std::string &ErrMsg,
PM.add(new TargetLibraryInfoWrapperPass(TLII));
std::unique_ptr<MachineModuleInfoWrapperPass> MMIWP(
new MachineModuleInfoWrapperPass(Target.get()));
- const_cast<TargetLoweringObjectFile *>(Target->getObjFileLowering())
- ->Initialize(MMIWP->getMMI().getContext(), *Target);
+ Target->getObjFileLowering()->Initialize(MMIWP->getMMI().getContext(),
+ *Target);
SmallString<4096> OutBuffer;
raw_svector_ostream OutStream(OutBuffer);
diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
index b90e1aa..3c631ce 100644
--- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
@@ -665,10 +665,10 @@ Type *SPIRVEmitIntrinsics::deduceElementTypeHelper(
auto *HandleType = cast<TargetExtType>(II->getOperand(0)->getType());
if (HandleType->getTargetExtName() == "spirv.Image" ||
HandleType->getTargetExtName() == "spirv.SignedImage") {
- if (II->hasOneUse()) {
- auto *U = *II->users().begin();
+ for (User *U : II->users()) {
Ty = cast<Instruction>(U)->getAccessType();
- assert(Ty && "Unable to get type for resource pointer.");
+ if (Ty)
+ break;
}
} else if (HandleType->getTargetExtName() == "spirv.VulkanBuffer") {
// This call is supposed to index into an array
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
index 4a9c88b..a95c4ff 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
@@ -7,7 +7,6 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/SparcFixupKinds.h"
-#include "MCTargetDesc/SparcMCAsmInfo.h"
#include "MCTargetDesc/SparcMCTargetDesc.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCELFObjectWriter.h"
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
index 1ee6e80..79da53e 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
@@ -13,10 +13,7 @@
#include "MCTargetDesc/SparcMCAsmInfo.h"
#include "llvm/BinaryFormat/ELF.h"
-#include "llvm/MC/MCAssembler.h"
-#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCObjectStreamer.h"
-#include "llvm/MC/MCValue.h"
using namespace llvm;
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp
index 3ef6030..72bb372 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp
@@ -69,8 +69,8 @@ void SystemZHLASMAsmStreamer::EmitEOL() {
void SystemZHLASMAsmStreamer::changeSection(MCSection *Section,
uint32_t Subsection) {
- Section->printSwitchToSection(*MAI, getContext().getTargetTriple(), OS,
- Subsection);
+ MAI->printSwitchToSection(*Section, Subsection,
+ getContext().getTargetTriple(), OS);
MCStreamer::changeSection(Section, Subsection);
}
diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
index 19c9e9c..6ae69a4 100644
--- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
+++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
@@ -900,7 +900,8 @@ public:
bool checkDataSection() {
if (CurrentState != DataSection) {
- auto *WS = cast<MCSectionWasm>(getStreamer().getCurrentSectionOnly());
+ auto *WS = static_cast<const MCSectionWasm *>(
+ getStreamer().getCurrentSectionOnly());
if (WS && WS->isText())
return error("data directive must occur in a data segment: ",
Lexer.getTok());
@@ -1218,7 +1219,8 @@ public:
void doBeforeLabelEmit(MCSymbol *Symbol, SMLoc IDLoc) override {
// Code below only applies to labels in text sections.
- auto *CWS = cast<MCSectionWasm>(getStreamer().getCurrentSectionOnly());
+ auto *CWS = static_cast<const MCSectionWasm *>(
+ getStreamer().getCurrentSectionOnly());
if (!CWS->isText())
return;
diff --git a/llvm/lib/Target/WebAssembly/WebAssembly.td b/llvm/lib/Target/WebAssembly/WebAssembly.td
index 13603f8..a606209 100644
--- a/llvm/lib/Target/WebAssembly/WebAssembly.td
+++ b/llvm/lib/Target/WebAssembly/WebAssembly.td
@@ -71,6 +71,7 @@ def FeatureReferenceTypes :
SubtargetFeature<"reference-types", "HasReferenceTypes", "true",
"Enable reference types">;
+def FeatureGC : SubtargetFeature<"gc", "HasGC", "true", "Enable wasm gc">;
def FeatureRelaxedSIMD :
SubtargetFeature<"relaxed-simd", "SIMDLevel", "RelaxedSIMD",
"Enable relaxed-simd instructions">;
@@ -136,13 +137,13 @@ def : ProcessorModel<"lime1", NoSchedModel,
// Latest and greatest experimental version of WebAssembly. Bugs included!
def : ProcessorModel<"bleeding-edge", NoSchedModel,
- [FeatureAtomics, FeatureBulkMemory, FeatureBulkMemoryOpt,
- FeatureCallIndirectOverlong, FeatureExceptionHandling,
- FeatureExtendedConst, FeatureFP16, FeatureMultiMemory,
- FeatureMultivalue, FeatureMutableGlobals,
- FeatureNontrappingFPToInt, FeatureRelaxedSIMD,
- FeatureReferenceTypes, FeatureSIMD128, FeatureSignExt,
- FeatureTailCall]>;
+ [FeatureAtomics, FeatureBulkMemory, FeatureBulkMemoryOpt,
+ FeatureCallIndirectOverlong, FeatureExceptionHandling,
+ FeatureExtendedConst, FeatureFP16, FeatureMultiMemory,
+ FeatureMultivalue, FeatureMutableGlobals,
+ FeatureNontrappingFPToInt, FeatureRelaxedSIMD,
+ FeatureReferenceTypes, FeatureGC, FeatureSIMD128,
+ FeatureSignExt, FeatureTailCall]>;
//===----------------------------------------------------------------------===//
// Target Declaration
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
index fa5776c..b03b350 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
@@ -123,7 +123,7 @@ static SDValue getTagSymNode(int Tag, SelectionDAG *DAG) {
static APInt encodeFunctionSignature(SelectionDAG *DAG, SDLoc &DL,
SmallVector<MVT, 4> &Returns,
SmallVector<MVT, 4> &Params) {
- auto toWasmValType = [&](MVT VT) {
+ auto toWasmValType = [](MVT VT) {
if (VT == MVT::i32) {
return wasm::ValType::I32;
}
@@ -244,10 +244,18 @@ void WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
MCSymbol *Table = WebAssembly::getOrCreateFunctionTableSymbol(
MF.getContext(), Subtarget);
SDValue TableSym = CurDAG->getMCSymbol(Table, PtrVT);
- SDValue FuncRef = SDValue(
- CurDAG->getMachineNode(WebAssembly::TABLE_GET_FUNCREF, DL,
- MVT::funcref, TableSym, Node->getOperand(1)),
- 0);
+ SDValue FuncPtr = Node->getOperand(1);
+ if (Subtarget->hasAddr64() && FuncPtr.getValueType() == MVT::i64) {
+      // table.get expects an i32, but on 64-bit platforms the function pointer
+      // is an i64. In that case, use i32.wrap_i64 to convert it.
+ FuncPtr = SDValue(CurDAG->getMachineNode(WebAssembly::I32_WRAP_I64, DL,
+ MVT::i32, FuncPtr),
+ 0);
+ }
+ SDValue FuncRef =
+ SDValue(CurDAG->getMachineNode(WebAssembly::TABLE_GET_FUNCREF, DL,
+ MVT::funcref, TableSym, FuncPtr),
+ 0);
// Encode the signature information into the type index placeholder.
// This gets decoded and converted into the actual type signature in
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 11936a3..cd434f7 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -288,7 +288,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
// Expand float operations supported for scalars but not SIMD
for (auto Op : {ISD::FCOPYSIGN, ISD::FLOG, ISD::FLOG2, ISD::FLOG10,
- ISD::FEXP, ISD::FEXP2})
+ ISD::FEXP, ISD::FEXP2, ISD::FEXP10})
for (auto T : {MVT::v4f32, MVT::v2f64})
setOperationAction(Op, T, Expand);
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index b5e723e..2b632fd 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -76,6 +76,9 @@ def HasReferenceTypes :
Predicate<"Subtarget->hasReferenceTypes()">,
AssemblerPredicate<(all_of FeatureReferenceTypes), "reference-types">;
+def HasGC : Predicate<"Subtarget->hasGC()">,
+ AssemblerPredicate<(all_of FeatureGC), "gc">;
+
def HasRelaxedSIMD :
Predicate<"Subtarget->hasRelaxedSIMD()">,
AssemblerPredicate<(all_of FeatureRelaxedSIMD), "relaxed-simd">;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td
index 40b87a0..fc82e5b 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td
@@ -36,13 +36,10 @@ multiclass REF_I<WebAssemblyRegClass rc, ValueType vt, string ht> {
Requires<[HasReferenceTypes]>;
}
-defm REF_TEST_FUNCREF :
- I<(outs I32: $res),
- (ins TypeIndex:$type, FUNCREF: $ref),
- (outs),
- (ins TypeIndex:$type),
- [],
- "ref.test\t$type, $ref", "ref.test $type", 0xfb14>;
+defm REF_TEST_FUNCREF : I<(outs I32:$res), (ins TypeIndex:$type, FUNCREF:$ref),
+ (outs), (ins TypeIndex:$type), [],
+ "ref.test\t$type, $ref", "ref.test $type", 0xfb14>,
+ Requires<[HasGC]>;
defm "" : REF_I<FUNCREF, funcref, "func">;
defm "" : REF_I<EXTERNREF, externref, "extern">;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
index 7912aeb..ffd135d 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
@@ -63,8 +63,10 @@ void OptimizeReturned::visitCallBase(CallBase &CB) {
if (isa<Constant>(Arg))
continue;
// Like replaceDominatedUsesWith but using Instruction/Use dominance.
- Arg->replaceUsesWithIf(&CB,
- [&](Use &U) { return DT->dominates(&CB, U); });
+ Arg->replaceUsesWithIf(&CB, [&](Use &U) {
+ auto *I = cast<Instruction>(U.getUser());
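+        // Leave lifetime.start/lifetime.end uses alone; only rewrite uses
+        // dominated by the call.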
+ return !I->isLifetimeStartOrEnd() && DT->dominates(&CB, U);
+ });
}
}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
index 40ea48a..a3ce40f 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
@@ -43,6 +43,11 @@ WebAssemblySubtarget::initializeSubtargetDependencies(StringRef CPU,
Bits.set(WebAssembly::FeatureBulkMemoryOpt);
}
+ // gc implies reference-types
+ if (HasGC) {
+ HasReferenceTypes = true;
+ }
+
// reference-types implies call-indirect-overlong
if (HasReferenceTypes) {
HasCallIndirectOverlong = true;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h
index 591ce256..f814274 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h
@@ -51,6 +51,7 @@ class WebAssemblySubtarget final : public WebAssemblyGenSubtargetInfo {
bool HasMutableGlobals = false;
bool HasNontrappingFPToInt = false;
bool HasReferenceTypes = false;
+ bool HasGC = false;
bool HasSignExt = false;
bool HasTailCall = false;
bool HasWideArithmetic = false;
@@ -107,6 +108,7 @@ public:
bool hasMutableGlobals() const { return HasMutableGlobals; }
bool hasNontrappingFPToInt() const { return HasNontrappingFPToInt; }
bool hasReferenceTypes() const { return HasReferenceTypes; }
+ bool hasGC() const { return HasGC; }
bool hasRelaxedSIMD() const { return SIMDLevel >= RelaxedSIMD; }
bool hasSignExt() const { return HasSignExt; }
bool hasSIMD128() const { return SIMDLevel >= SIMD128; }
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index b642c1c..d7671ed 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -1042,8 +1042,8 @@ private:
}
PrevState = CurrState;
}
- void onRParen() {
- PrevState = State;
+ bool onRParen(StringRef &ErrMsg) {
+ IntelExprState CurrState = State;
switch (State) {
default:
State = IES_ERROR;
@@ -1054,9 +1054,27 @@ private:
case IES_RBRAC:
case IES_RPAREN:
State = IES_RPAREN;
+      // In the case of a multiply, onRegister has already set IndexReg
+      // directly, with the appropriate scale.
+      // Otherwise, if we just saw a register, it has only been stored in
+      // TmpReg, so we need to store it into the state machine.
+ if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) {
+ // If we already have a BaseReg, then assume this is the IndexReg with
+ // no explicit scale.
+ if (!BaseReg) {
+ BaseReg = TmpReg;
+ } else {
+ if (IndexReg)
+ return regsUseUpError(ErrMsg);
+ IndexReg = TmpReg;
+ Scale = 0;
+ }
+ }
IC.pushOperator(IC_RPAREN);
break;
}
+ PrevState = CurrState;
+ return false;
}
bool onOffset(const MCExpr *Val, SMLoc OffsetLoc, StringRef ID,
const InlineAsmIdentifierInfo &IDInfo,
@@ -2172,7 +2190,11 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
}
break;
case AsmToken::LParen: SM.onLParen(); break;
- case AsmToken::RParen: SM.onRParen(); break;
+ case AsmToken::RParen:
+ if (SM.onRParen(ErrMsg)) {
+ return Error(Tok.getLoc(), ErrMsg);
+ }
+ break;
}
if (SM.hadError())
return Error(Tok.getLoc(), "unknown token in expression");
@@ -4781,7 +4803,7 @@ bool X86AsmParser::parseDirectiveEven(SMLoc L) {
getStreamer().initSections(false, getSTI());
Section = getStreamer().getCurrentSectionOnly();
}
- if (Section->useCodeAlign())
+ if (getContext().getAsmInfo()->useCodeAlign(*Section))
getStreamer().emitCodeAlignment(Align(2), &getSTI(), 0);
else
getStreamer().emitValueToAlignment(Align(2), 0, 1, 0);
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index e213923..7f9d474 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -388,36 +388,6 @@ static bool mayHaveInterruptDelaySlot(unsigned InstOpcode) {
return false;
}
-/// Check if the instruction to be emitted is right after any data.
-static bool
-isRightAfterData(MCFragment *CurrentFragment,
- const std::pair<MCFragment *, size_t> &PrevInstPosition) {
- MCFragment *F = CurrentFragment;
- // Since data is always emitted into a DataFragment, our check strategy is
- // simple here.
- // - If the fragment is a DataFragment
- // - If it's empty (section start or data after align), return false.
- // - If it's not the fragment where the previous instruction is,
- // returns true.
- // - If it's the fragment holding the previous instruction but its
- // size changed since the previous instruction was emitted into
- // it, returns true.
- // - Otherwise returns false.
- // - If the fragment is not a DataFragment, returns false.
- if (F->getKind() == MCFragment::FT_Data)
- return F->getFixedSize() && (F != PrevInstPosition.first ||
- F->getFixedSize() != PrevInstPosition.second);
-
- return false;
-}
-
-/// \returns the fragment size if it has instructions, otherwise returns 0.
-static size_t getSizeForInstFragment(const MCFragment *F) {
- if (!F || !F->hasInstructions())
- return 0;
- return F->getSize();
-}
-
/// Return true if we can insert NOP or prefixes automatically before the
/// the instruction to be emitted.
bool X86AsmBackend::canPadInst(const MCInst &Inst, MCObjectStreamer &OS) const {
@@ -441,9 +411,11 @@ bool X86AsmBackend::canPadInst(const MCInst &Inst, MCObjectStreamer &OS) const {
// semantic.
return false;
- if (isRightAfterData(OS.getCurrentFragment(), PrevInstPosition))
- // If this instruction follows any data, there is no clear
- // instruction boundary, inserting a nop/prefix would change semantic.
+  // If this instruction follows any data, there is no clear instruction
+  // boundary; inserting a nop/prefix would change the semantics.
+ auto Offset = OS.getCurFragSize();
+ if (Offset && (OS.getCurrentFragment() != PrevInstPosition.first ||
+ Offset != PrevInstPosition.second))
return false;
return true;
@@ -552,7 +524,7 @@ void X86AsmBackend::emitInstructionEnd(MCObjectStreamer &OS,
// Update PrevInstOpcode here, canPadInst() reads that.
MCFragment *CF = OS.getCurrentFragment();
PrevInstOpcode = Inst.getOpcode();
- PrevInstPosition = std::make_pair(CF, getSizeForInstFragment(CF));
+ PrevInstPosition = std::make_pair(CF, OS.getCurFragSize());
if (!canPadBranches(OS))
return;
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
index f5eeb3b..d691538 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
@@ -11,7 +11,6 @@
//===----------------------------------------------------------------------===//
#include "X86MCAsmInfo.h"
-#include "MCTargetDesc/X86MCExpr.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/Support/CommandLine.h"
diff --git a/llvm/lib/Target/X86/X86PassRegistry.def b/llvm/lib/Target/X86/X86PassRegistry.def
index 620526ff..3f2a433 100644
--- a/llvm/lib/Target/X86/X86PassRegistry.def
+++ b/llvm/lib/Target/X86/X86PassRegistry.def
@@ -12,8 +12,52 @@
// NOTE: NO INCLUDE GUARD DESIRED!
+#ifndef DUMMY_FUNCTION_PASS
+#define DUMMY_FUNCTION_PASS(NAME, CREATE_PASS)
+#endif
+DUMMY_FUNCTION_PASS("lower-amx-intrinsics", X86LowerAMXIntrinsics(*this))
+DUMMY_FUNCTION_PASS("lower-amx-type", X86LowerAMXTypePass(*this))
+DUMMY_FUNCTION_PASS("x86-partial-reduction", X86PartialReduction())
+DUMMY_FUNCTION_PASS("x86-winehstate", WinEHStatePass())
+#undef DUMMY_FUNCTION_PASS
+
#ifndef MACHINE_FUNCTION_PASS
#define MACHINE_FUNCTION_PASS(NAME, CREATE_PASS)
#endif
MACHINE_FUNCTION_PASS("x86-isel", X86ISelDAGToDAGPass(*this))
#undef MACHINE_FUNCTION_PASS
+
+#ifndef DUMMY_MACHINE_FUNCTION_PASS
+#define DUMMY_MACHINE_FUNCTION_PASS(NAME, PASS_NAME)
+#endif
+DUMMY_MACHINE_FUNCTION_PASS("x86-avoid-SFB", X86AvoidSFBPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-avoid-trailing-call", X86AvoidTrailingCallPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-cf-opt", X86CallFrameOptimization())
+DUMMY_MACHINE_FUNCTION_PASS("x86-cmov-conversion", X86CmovConverterPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-codege", FPS())
+DUMMY_MACHINE_FUNCTION_PASS("x86-compress-evex", CompressEVEXPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-domain-reassignment", X86DomainReassignment())
+DUMMY_MACHINE_FUNCTION_PASS("x86-dyn-alloca-expander", X86DynAllocaExpander())
+DUMMY_MACHINE_FUNCTION_PASS("x86-execution-domain-fix", X86ExecutionDomainFix())
+DUMMY_MACHINE_FUNCTION_PASS("fastpretileconfig", X86FastPreTileConfig())
+DUMMY_MACHINE_FUNCTION_PASS("fasttileconfig", X86FastTileConfig())
+DUMMY_MACHINE_FUNCTION_PASS("x86-fixup-LEAs", FixupLEAPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-fixup-bw-inst", FixupBWInstPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-fixup-inst-tuning", X86FixupInstTuningPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-fixup-setcc", X86FixupSetCCPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-fixup-vector-constants", X86FixupVectorConstantsPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-flags-copy-lowering", X86FlagsCopyLoweringPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-lower-tile-copy", X86LowerTileCopy())
+DUMMY_MACHINE_FUNCTION_PASS("x86-lvi-load", X86LoadValueInjectionLoadHardeningPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-lvi-ret", X86LoadValueInjectionRetHardeningPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-optimize-LEAs", X86OptimizeLEAPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-pseudo", X86ExpandPseudo())
+DUMMY_MACHINE_FUNCTION_PASS("x86-return-thunks", X86ReturnThunks())
+DUMMY_MACHINE_FUNCTION_PASS("x86-seses", X86SpeculativeExecutionSideEffectSuppression())
+DUMMY_MACHINE_FUNCTION_PASS("x86-slh", X86SpeculativeLoadHardeningPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-suppress-apx-for-relocation", X86SuppressAPXForRelocationPass())
+DUMMY_MACHINE_FUNCTION_PASS("tile-pre-config", X86PreTileConfig())
+DUMMY_MACHINE_FUNCTION_PASS("tileconfig", X86TileConfig())
+DUMMY_MACHINE_FUNCTION_PASS("x86-wineh-unwindv2", X86WinEHUnwindV2())
+DUMMY_MACHINE_FUNCTION_PASS("x86argumentstackrebase", X86ArgumentStackSlotPass())
+#undef DUMMY_MACHINE_FUNCTION_PASS
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 37a7b37..90791fc 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1838,14 +1838,15 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
return LT.first * *KindCost;
static const CostKindTblEntry AVX512BWShuffleTbl[] = {
- { TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw
- { TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw
- { TTI::SK_Broadcast, MVT::v64i8, { 1, 1, 1, 1 } }, // vpbroadcastb
+ { TTI::SK_Broadcast, MVT::v32i16, { 1, 3, 1, 1 } }, // vpbroadcastw
+ { TTI::SK_Broadcast, MVT::v32f16, { 1, 3, 1, 1 } }, // vpbroadcastw
+ { TTI::SK_Broadcast, MVT::v64i8, { 1, 3, 1, 1 } }, // vpbroadcastb
- { TTI::SK_Reverse, MVT::v32i16, { 2, 2, 2, 2 } }, // vpermw
- { TTI::SK_Reverse, MVT::v32f16, { 2, 2, 2, 2 } }, // vpermw
+ { TTI::SK_Reverse, MVT::v32i16, { 2, 6, 2, 4 } }, // vpermw
+ { TTI::SK_Reverse, MVT::v32f16, { 2, 6, 2, 4 } }, // vpermw
{ TTI::SK_Reverse, MVT::v16i16, { 2, 2, 2, 2 } }, // vpermw
- { TTI::SK_Reverse, MVT::v64i8, { 2, 2, 2, 2 } }, // pshufb + vshufi64x2
+ { TTI::SK_Reverse, MVT::v16f16, { 2, 2, 2, 2 } }, // vpermw
+ { TTI::SK_Reverse, MVT::v64i8, { 2, 9, 2, 3 } }, // pshufb + vshufi64x2
{ TTI::SK_PermuteSingleSrc, MVT::v32i16, { 2, 2, 2, 2 } }, // vpermw
{ TTI::SK_PermuteSingleSrc, MVT::v32f16, { 2, 2, 2, 2 } }, // vpermw
@@ -1874,18 +1875,25 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
return LT.first * *KindCost;
static const CostKindTblEntry AVX512ShuffleTbl[] = {
- {TTI::SK_Broadcast, MVT::v8f64, { 1, 1, 1, 1 } }, // vbroadcastsd
- {TTI::SK_Broadcast, MVT::v16f32, { 1, 1, 1, 1 } }, // vbroadcastss
- {TTI::SK_Broadcast, MVT::v8i64, { 1, 1, 1, 1 } }, // vpbroadcastq
- {TTI::SK_Broadcast, MVT::v16i32, { 1, 1, 1, 1 } }, // vpbroadcastd
- {TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw
- {TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw
- {TTI::SK_Broadcast, MVT::v64i8, { 1, 1, 1, 1 } }, // vpbroadcastb
-
- {TTI::SK_Reverse, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
- {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
- {TTI::SK_Reverse, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
- {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
+ {TTI::SK_Broadcast, MVT::v8f64, { 1, 3, 1, 1 } }, // vbroadcastsd
+ {TTI::SK_Broadcast, MVT::v4f64, { 1, 3, 1, 1 } }, // vbroadcastsd
+ {TTI::SK_Broadcast, MVT::v16f32, { 1, 3, 1, 1 } }, // vbroadcastss
+ {TTI::SK_Broadcast, MVT::v8f32, { 1, 3, 1, 1 } }, // vbroadcastss
+ {TTI::SK_Broadcast, MVT::v8i64, { 1, 3, 1, 1 } }, // vpbroadcastq
+ {TTI::SK_Broadcast, MVT::v4i64, { 1, 3, 1, 1 } }, // vpbroadcastq
+ {TTI::SK_Broadcast, MVT::v16i32, { 1, 3, 1, 1 } }, // vpbroadcastd
+ {TTI::SK_Broadcast, MVT::v8i32, { 1, 3, 1, 1 } }, // vpbroadcastd
+ {TTI::SK_Broadcast, MVT::v32i16, { 1, 3, 1, 1 } }, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v16i16, { 1, 3, 1, 1 } }, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v32f16, { 1, 3, 1, 1 } }, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v16f16, { 1, 3, 1, 1 } }, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v64i8, { 1, 3, 1, 1 } }, // vpbroadcastb
+    {TTI::SK_Broadcast, MVT::v32i8, { 1, 3, 1, 1 } }, // vpbroadcastb
+
+ {TTI::SK_Reverse, MVT::v8f64, { 1, 5, 2, 3 } }, // vpermpd
+ {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 2, 3 } }, // vpermps
+ {TTI::SK_Reverse, MVT::v8i64, { 1, 5, 2, 3 } }, // vpermq
+ {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 2, 3 } }, // vpermd
{TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca
{TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca
{TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca
@@ -1973,21 +1981,24 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
return LT.first * *KindCost;
static const CostKindTblEntry AVX2ShuffleTbl[] = {
- { TTI::SK_Broadcast, MVT::v4f64, { 1, 1, 1, 1 } }, // vbroadcastpd
- { TTI::SK_Broadcast, MVT::v8f32, { 1, 1, 1, 1 } }, // vbroadcastps
- { TTI::SK_Broadcast, MVT::v4i64, { 1, 1, 1, 1 } }, // vpbroadcastq
- { TTI::SK_Broadcast, MVT::v8i32, { 1, 1, 1, 1 } }, // vpbroadcastd
- { TTI::SK_Broadcast, MVT::v16i16, { 1, 1, 1, 1 } }, // vpbroadcastw
- { TTI::SK_Broadcast, MVT::v16f16, { 1, 1, 1, 1 } }, // vpbroadcastw
- { TTI::SK_Broadcast, MVT::v32i8, { 1, 1, 1, 1 } }, // vpbroadcastb
-
- { TTI::SK_Reverse, MVT::v4f64, { 1, 1, 1, 1 } }, // vpermpd
- { TTI::SK_Reverse, MVT::v8f32, { 1, 1, 1, 1 } }, // vpermps
- { TTI::SK_Reverse, MVT::v4i64, { 1, 1, 1, 1 } }, // vpermq
- { TTI::SK_Reverse, MVT::v8i32, { 1, 1, 1, 1 } }, // vpermd
- { TTI::SK_Reverse, MVT::v16i16, { 2, 2, 2, 2 } }, // vperm2i128 + pshufb
- { TTI::SK_Reverse, MVT::v16f16, { 2, 2, 2, 2 } }, // vperm2i128 + pshufb
- { TTI::SK_Reverse, MVT::v32i8, { 2, 2, 2, 2 } }, // vperm2i128 + pshufb
+ { TTI::SK_Broadcast, MVT::v4f64, { 1, 3, 1, 2 } }, // vbroadcastpd
+ { TTI::SK_Broadcast, MVT::v8f32, { 1, 3, 1, 2 } }, // vbroadcastps
+ { TTI::SK_Broadcast, MVT::v4i64, { 1, 3, 1, 2 } }, // vpbroadcastq
+ { TTI::SK_Broadcast, MVT::v8i32, { 1, 3, 1, 2 } }, // vpbroadcastd
+ { TTI::SK_Broadcast, MVT::v16i16, { 1, 3, 1, 2 } }, // vpbroadcastw
+ { TTI::SK_Broadcast, MVT::v8i16, { 1, 3, 1, 1 } }, // vpbroadcastw
+ { TTI::SK_Broadcast, MVT::v16f16, { 1, 3, 1, 2 } }, // vpbroadcastw
+ { TTI::SK_Broadcast, MVT::v8f16, { 1, 3, 1, 1 } }, // vpbroadcastw
+ { TTI::SK_Broadcast, MVT::v32i8, { 1, 3, 1, 2 } }, // vpbroadcastb
+ { TTI::SK_Broadcast, MVT::v16i8, { 1, 3, 1, 1 } }, // vpbroadcastb
+
+ { TTI::SK_Reverse, MVT::v4f64, { 1, 6, 1, 2 } }, // vpermpd
+ { TTI::SK_Reverse, MVT::v8f32, { 2, 7, 2, 4 } }, // vpermps
+ { TTI::SK_Reverse, MVT::v4i64, { 1, 6, 1, 2 } }, // vpermq
+ { TTI::SK_Reverse, MVT::v8i32, { 2, 7, 2, 4 } }, // vpermd
+ { TTI::SK_Reverse, MVT::v16i16, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
+ { TTI::SK_Reverse, MVT::v16f16, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
+ { TTI::SK_Reverse, MVT::v32i8, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
{ TTI::SK_Select, MVT::v16i16, { 1, 1, 1, 1 } }, // vpblendvb
{ TTI::SK_Select, MVT::v16f16, { 1, 1, 1, 1 } }, // vpblendvb
@@ -2077,23 +2088,23 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
return LT.first * *KindCost;
static const CostKindTblEntry AVX1ShuffleTbl[] = {
- {TTI::SK_Broadcast, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + vpermilpd
- {TTI::SK_Broadcast, MVT::v8f32, {2,2,2,2}}, // vperm2f128 + vpermilps
- {TTI::SK_Broadcast, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + vpermilpd
- {TTI::SK_Broadcast, MVT::v8i32, {2,2,2,2}}, // vperm2f128 + vpermilps
- {TTI::SK_Broadcast, MVT::v16i16, {3,3,3,3}}, // vpshuflw + vpshufd + vinsertf128
- {TTI::SK_Broadcast, MVT::v16f16, {3,3,3,3}}, // vpshuflw + vpshufd + vinsertf128
- {TTI::SK_Broadcast, MVT::v32i8, {2,2,2,2}}, // vpshufb + vinsertf128
-
- {TTI::SK_Reverse, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + vpermilpd
- {TTI::SK_Reverse, MVT::v8f32, {2,2,2,2}}, // vperm2f128 + vpermilps
- {TTI::SK_Reverse, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + vpermilpd
- {TTI::SK_Reverse, MVT::v8i32, {2,2,2,2}}, // vperm2f128 + vpermilps
- {TTI::SK_Reverse, MVT::v16i16, {4,4,4,4}}, // vextractf128 + 2*pshufb
+ {TTI::SK_Broadcast, MVT::v4f64, {2,3,2,3}}, // vperm2f128 + vpermilpd
+ {TTI::SK_Broadcast, MVT::v8f32, {2,3,2,3}}, // vperm2f128 + vpermilps
+ {TTI::SK_Broadcast, MVT::v4i64, {2,3,2,3}}, // vperm2f128 + vpermilpd
+ {TTI::SK_Broadcast, MVT::v8i32, {2,3,2,3}}, // vperm2f128 + vpermilps
+ {TTI::SK_Broadcast, MVT::v16i16, {2,3,3,4}}, // vpshuflw + vpshufd + vinsertf128
+ {TTI::SK_Broadcast, MVT::v16f16, {2,3,3,4}}, // vpshuflw + vpshufd + vinsertf128
+ {TTI::SK_Broadcast, MVT::v32i8, {3,4,3,6}}, // vpshufb + vinsertf128
+
+ {TTI::SK_Reverse, MVT::v4f64, {2,6,2,2}}, // vperm2f128 + vpermilpd
+ {TTI::SK_Reverse, MVT::v8f32, {2,7,2,4}}, // vperm2f128 + vpermilps
+ {TTI::SK_Reverse, MVT::v4i64, {2,6,2,2}}, // vperm2f128 + vpermilpd
+ {TTI::SK_Reverse, MVT::v8i32, {2,7,2,4}}, // vperm2f128 + vpermilps
+ {TTI::SK_Reverse, MVT::v16i16, {2,9,5,5}}, // vextractf128 + 2*pshufb
// + vinsertf128
- {TTI::SK_Reverse, MVT::v16f16, {4,4,4,4}}, // vextractf128 + 2*pshufb
+ {TTI::SK_Reverse, MVT::v16f16, {2,9,5,5}}, // vextractf128 + 2*pshufb
// + vinsertf128
- {TTI::SK_Reverse, MVT::v32i8, {4,4,4,4}}, // vextractf128 + 2*pshufb
+ {TTI::SK_Reverse, MVT::v32i8, {2,9,5,5}}, // vextractf128 + 2*pshufb
// + vinsertf128
{TTI::SK_Select, MVT::v4i64, {1,1,1,1}}, // vblendpd
@@ -2156,13 +2167,13 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
return LT.first * *KindCost;
static const CostKindTblEntry SSSE3ShuffleTbl[] = {
- {TTI::SK_Broadcast, MVT::v8i16, {1, 1, 1, 1}}, // pshufb
- {TTI::SK_Broadcast, MVT::v8f16, {1, 1, 1, 1}}, // pshufb
- {TTI::SK_Broadcast, MVT::v16i8, {1, 1, 1, 1}}, // pshufb
+ {TTI::SK_Broadcast, MVT::v8i16, {1, 3, 2, 2}}, // pshufb
+ {TTI::SK_Broadcast, MVT::v8f16, {1, 3, 2, 2}}, // pshufb
+ {TTI::SK_Broadcast, MVT::v16i8, {1, 3, 2, 2}}, // pshufb
- {TTI::SK_Reverse, MVT::v8i16, {1, 1, 1, 1}}, // pshufb
- {TTI::SK_Reverse, MVT::v8f16, {1, 1, 1, 1}}, // pshufb
- {TTI::SK_Reverse, MVT::v16i8, {1, 1, 1, 1}}, // pshufb
+ {TTI::SK_Reverse, MVT::v8i16, {1, 2, 1, 2}}, // pshufb
+ {TTI::SK_Reverse, MVT::v8f16, {1, 2, 1, 2}}, // pshufb
+ {TTI::SK_Reverse, MVT::v16i8, {1, 2, 1, 2}}, // pshufb
{TTI::SK_Select, MVT::v8i16, {3, 3, 3, 3}}, // 2*pshufb + por
{TTI::SK_Select, MVT::v8f16, {3, 3, 3, 3}}, // 2*pshufb + por
@@ -2192,16 +2203,16 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_Broadcast, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
{TTI::SK_Broadcast, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
{TTI::SK_Broadcast, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
- {TTI::SK_Broadcast, MVT::v8i16, {2, 2, 2, 2}}, // pshuflw + pshufd
- {TTI::SK_Broadcast, MVT::v8f16, {2, 2, 2, 2}}, // pshuflw + pshufd
- {TTI::SK_Broadcast, MVT::v16i8, {3, 3, 3, 3}}, // unpck + pshuflw + pshufd
+ {TTI::SK_Broadcast, MVT::v8i16, {1, 2, 2, 2}}, // pshuflw + pshufd
+ {TTI::SK_Broadcast, MVT::v8f16, {1, 2, 2, 2}}, // pshuflw + pshufd
+ {TTI::SK_Broadcast, MVT::v16i8, {2, 3, 3, 4}}, // unpck + pshuflw + pshufd
{TTI::SK_Reverse, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
{TTI::SK_Reverse, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
{TTI::SK_Reverse, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
- {TTI::SK_Reverse, MVT::v8i16, {3, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
- {TTI::SK_Reverse, MVT::v8f16, {3, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
- {TTI::SK_Reverse, MVT::v16i8, {9, 9, 9, 9}}, // 2*pshuflw + 2*pshufhw
+ {TTI::SK_Reverse, MVT::v8i16, {2, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
+ {TTI::SK_Reverse, MVT::v8f16, {2, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
+ {TTI::SK_Reverse, MVT::v16i8, {5, 6,11,11}}, // 2*pshuflw + 2*pshufhw
// + 2*pshufd + 2*unpck + packus
{TTI::SK_Select, MVT::v2i64, {1, 1, 1, 1}}, // movsd