Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp | 66
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 77
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.h | 3
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 5
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrInfo.td | 43
-rw-r--r--  llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp | 26
-rw-r--r--  llvm/lib/Target/AArch64/AArch64RegisterInfo.td | 3
-rw-r--r--  llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp | 23
-rw-r--r--  llvm/lib/Target/AArch64/AArch64StackTagging.cpp | 6
-rw-r--r--  llvm/lib/Target/AArch64/AArch64Subtarget.cpp | 2
-rw-r--r--  llvm/lib/Target/AArch64/AArch64Subtarget.h | 3
-rw-r--r--  llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 2
-rw-r--r--  llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h | 2
-rw-r--r--  llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPU.td | 63
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp | 61
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 359
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 43
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 254
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h | 24
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp | 317
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp | 239
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp | 25
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 25
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 64
-rw-r--r--  llvm/lib/Target/AMDGPU/FLATInstructions.td | 144
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 183
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSubtarget.h | 35
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/R600TargetMachine.cpp | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/SIDefines.h | 14
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 14
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 56
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrFormats.td | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 17
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.td | 7
-rw-r--r--  llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 5
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 36
-rw-r--r--  llvm/lib/Target/AMDGPU/SMInstructions.td | 28
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 19
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP2Instructions.td | 17
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP3Instructions.td | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 225
-rw-r--r--  llvm/lib/Target/ARM/ARMISelLowering.cpp | 19
-rw-r--r--  llvm/lib/Target/ARM/ARMISelLowering.h | 3
-rw-r--r--  llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp | 2
-rw-r--r--  llvm/lib/Target/BPF/BPF.h | 2
-rw-r--r--  llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp | 1
-rw-r--r--  llvm/lib/Target/DirectX/DXILDataScalarization.cpp | 46
-rw-r--r--  llvm/lib/Target/DirectX/DXILPrepare.cpp | 23
-rw-r--r--  llvm/lib/Target/DirectX/DXILRootSignature.cpp | 471
-rw-r--r--  llvm/lib/Target/DirectX/DXILRootSignature.h | 11
-rw-r--r--  llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp | 36
-rw-r--r--  llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp | 57
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonInstrFormats.td | 1
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonInstrFormatsV60.td | 21
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonInstrFormatsV65.td | 5
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonIntrinsicsV5.td | 414
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td | 642
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp | 1
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonMapAsm2IntrinV62.gen.td | 179
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp | 5
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td | 14
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td | 17
-rw-r--r--  llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.cpp | 10
-rw-r--r--  llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp | 11
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 110
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 5
-rw-r--r--  llvm/lib/Target/PowerPC/PPCInstrFuture.td | 40
-rw-r--r--  llvm/lib/Target/PowerPC/PPCSubtarget.cpp | 2
-rw-r--r--  llvm/lib/Target/PowerPC/PPCSubtarget.h | 3
-rw-r--r--  llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp | 19
-rw-r--r--  llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp | 6
-rw-r--r--  llvm/lib/Target/RISCV/RISCVCallingConv.td | 14
-rw-r--r--  llvm/lib/Target/RISCV/RISCVFeatures.td | 6
-rw-r--r--  llvm/lib/Target/RISCV/RISCVFrameLowering.cpp | 53
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 70
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 32
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelLowering.h | 6
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoP.td | 105
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td | 460
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td | 19
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td | 15
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td | 42
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td | 10
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp | 204
-rw-r--r--  llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td | 200
-rw-r--r--  llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.cpp | 12
-rw-r--r--  llvm/lib/Target/RISCV/RISCVSubtarget.cpp | 6
-rw-r--r--  llvm/lib/Target/RISCV/RISCVSubtarget.h | 6
-rw-r--r--  llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp | 28
-rw-r--r--  llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h | 4
-rw-r--r--  llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp | 38
-rw-r--r--  llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp | 9
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp | 6
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp | 2
-rw-r--r--  llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp | 1
-rw-r--r--  llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp | 3
-rw-r--r--  llvm/lib/Target/Sparc/SparcISelLowering.cpp | 4
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp | 12
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp | 99
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp | 1
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp | 55
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h | 1
-rw-r--r--  llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp | 28
-rw-r--r--  llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp | 1
-rw-r--r--  llvm/lib/Target/X86/X86AsmPrinter.h | 1
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp | 24
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.h | 3
-rw-r--r--  llvm/lib/Target/X86/X86InterleavedAccess.cpp | 8
-rw-r--r--  llvm/lib/Target/X86/X86MCInstLower.cpp | 208
123 files changed, 3500 insertions, 3050 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 12fc976..201bfe0 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1205,32 +1205,36 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
Register DstReg = MI.getOperand(0).getReg();
if (DstReg == MI.getOperand(3).getReg()) {
// Expand to BIT
- BuildMI(MBB, MBBI, MI.getDebugLoc(),
- TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BITv8i8
- : AArch64::BITv16i8))
- .add(MI.getOperand(0))
- .add(MI.getOperand(3))
- .add(MI.getOperand(2))
- .add(MI.getOperand(1));
+ auto I = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BITv8i8
+ : AArch64::BITv16i8))
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(1));
+ transferImpOps(MI, I, I);
} else if (DstReg == MI.getOperand(2).getReg()) {
// Expand to BIF
- BuildMI(MBB, MBBI, MI.getDebugLoc(),
- TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BIFv8i8
- : AArch64::BIFv16i8))
- .add(MI.getOperand(0))
- .add(MI.getOperand(2))
- .add(MI.getOperand(3))
- .add(MI.getOperand(1));
+ auto I = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BIFv8i8
+ : AArch64::BIFv16i8))
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(1));
+ transferImpOps(MI, I, I);
} else {
// Expand to BSL, use additional move if required
if (DstReg == MI.getOperand(1).getReg()) {
- BuildMI(MBB, MBBI, MI.getDebugLoc(),
- TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8
- : AArch64::BSLv16i8))
- .add(MI.getOperand(0))
- .add(MI.getOperand(1))
- .add(MI.getOperand(2))
- .add(MI.getOperand(3));
+ auto I =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8
+ : AArch64::BSLv16i8))
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3));
+ transferImpOps(MI, I, I);
} else {
BuildMI(MBB, MBBI, MI.getDebugLoc(),
TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::ORRv8i8
@@ -1240,15 +1244,17 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
getRenamableRegState(MI.getOperand(0).isRenamable()))
.add(MI.getOperand(1))
.add(MI.getOperand(1));
- BuildMI(MBB, MBBI, MI.getDebugLoc(),
- TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8
- : AArch64::BSLv16i8))
- .add(MI.getOperand(0))
- .addReg(DstReg,
- RegState::Kill |
- getRenamableRegState(MI.getOperand(0).isRenamable()))
- .add(MI.getOperand(2))
- .add(MI.getOperand(3));
+ auto I2 =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8
+ : AArch64::BSLv16i8))
+ .add(MI.getOperand(0))
+ .addReg(DstReg,
+ RegState::Kill | getRenamableRegState(
+ MI.getOperand(0).isRenamable()))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3));
+ transferImpOps(MI, I2, I2);
}
}
MI.eraseFromParent();
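
The BSP expansion above picks BIT, BIF, or BSL depending on which source operand the destination register already aliases; the only functional change in this hunk is that implicit operands are now copied onto the replacement instruction via transferImpOps. As a rough aid, here is a standalone scalar model (my own illustration, not LLVM code; operand roles are read from the expansion above and the usual NEON per-bit semantics) showing why all three choices compute the same bitwise select:

// Per-bit model of the NEON instructions the BSP pseudo expands to.
#include <cassert>
#include <cstdint>

// bsl: dst initially holds the mask; result keeps t where the mask is 1, f where 0.
static uint64_t bsl(uint64_t mask, uint64_t t, uint64_t f) {
  return (t & mask) | (f & ~mask);
}
// bit: insert bits of n into dst where m is 1.
static uint64_t bit(uint64_t dst, uint64_t n, uint64_t m) {
  return (n & m) | (dst & ~m);
}
// bif: insert bits of n into dst where m is 0.
static uint64_t bif(uint64_t dst, uint64_t n, uint64_t m) {
  return (n & ~m) | (dst & m);
}

int main() {
  // BSP dst, mask, t, f computes (t & mask) | (f & ~mask).
  uint64_t mask = 0x00ff00ff00ff00ffULL;
  uint64_t t = 0x1111111111111111ULL, f = 0x2222222222222222ULL;
  uint64_t want = (t & mask) | (f & ~mask);
  assert(bsl(mask, t, f) == want); // dst already holds the mask -> BSL
  assert(bit(f, t, mask) == want); // dst aliases f -> BIT (insert t where mask is 1)
  assert(bif(t, f, mask) == want); // dst aliases t -> BIF (insert f where mask is 0)
  return 0;
}
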
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index f026726..ef3e8c8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -164,6 +164,9 @@ static cl::opt<bool> UseFEATCPACodegen(
/// Value type used for condition codes.
static const MVT MVT_CC = MVT::i32;
+/// Value type used for NZCV flags.
+static constexpr MVT FlagsVT = MVT::i32;
+
static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
AArch64::X3, AArch64::X4, AArch64::X5,
AArch64::X6, AArch64::X7};
@@ -3451,7 +3454,7 @@ static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &DL,
}
unsigned Opcode =
IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
- return DAG.getNode(Opcode, DL, {MVT::i32, MVT::Other}, {Chain, LHS, RHS});
+ return DAG.getNode(Opcode, DL, {FlagsVT, MVT::Other}, {Chain, LHS, RHS});
}
static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
@@ -3465,7 +3468,7 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
}
- return DAG.getNode(AArch64ISD::FCMP, DL, MVT::i32, LHS, RHS);
+ return DAG.getNode(AArch64ISD::FCMP, DL, FlagsVT, LHS, RHS);
}
// The CMP instruction is just an alias for SUBS, and representing it as
@@ -3490,7 +3493,7 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
// (a.k.a. ANDS) except that the flags are only guaranteed to work for one
// of the signed comparisons.
const SDValue ANDSNode =
- DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(VT, MVT_CC),
+ DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(VT, FlagsVT),
LHS.getOperand(0), LHS.getOperand(1));
// Replace all users of (and X, Y) with newly generated (ands X, Y)
DAG.ReplaceAllUsesWith(LHS, ANDSNode);
@@ -3501,7 +3504,7 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
}
}
- return DAG.getNode(Opcode, DL, DAG.getVTList(VT, MVT_CC), LHS, RHS)
+ return DAG.getNode(Opcode, DL, DAG.getVTList(VT, FlagsVT), LHS, RHS)
.getValue(1);
}
@@ -3597,7 +3600,7 @@ static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
- return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
+ return DAG.getNode(Opcode, DL, FlagsVT, LHS, RHS, NZCVOp, Condition, CCOp);
}
/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
@@ -4036,7 +4039,7 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
// Check that the result fits into a 32-bit integer.
- SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC);
+ SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
if (IsSigned) {
// cmp xreg, wreg, sxtw
SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
@@ -4059,12 +4062,12 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
DAG.getConstant(63, DL, MVT::i64));
// It is important that LowerBits is last, otherwise the arithmetic
// shift will not be folded into the compare (SUBS).
- SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
+ SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
.getValue(1);
} else {
SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
- SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
+ SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
Overflow =
DAG.getNode(AArch64ISD::SUBS, DL, VTs,
DAG.getConstant(0, DL, MVT::i64),
@@ -4075,7 +4078,7 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
} // switch (...)
if (Opc) {
- SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
+ SDVTList VTs = DAG.getVTList(Op->getValueType(0), FlagsVT);
// Emit the AArch64 operation with overflow check.
Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
@@ -4177,7 +4180,7 @@ static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) {
SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
SDValue Cmp =
- DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::Glue), Op0, Op1);
+ DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT), Op0, Op1);
return Cmp.getValue(1);
}
@@ -4220,16 +4223,15 @@ static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG,
SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
SDLoc DL(Op);
- SDVTList VTs = DAG.getVTList(VT0, VT1);
- SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::Glue), OpLHS,
+ SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, FlagsVT), OpLHS,
OpRHS, OpCarryIn);
SDValue OutFlag =
IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
: carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);
- return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Sum, OutFlag);
+ return DAG.getMergeValues({Sum, OutFlag}, DL);
}
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
@@ -4254,8 +4256,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
Overflow =
DAG.getNode(AArch64ISD::CSEL, DL, MVT::i32, FVal, TVal, CCVal, Overflow);
- SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
- return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Value, Overflow);
+ return DAG.getMergeValues({Value, Overflow}, DL);
}
// Prefetch operands are:
@@ -7037,9 +7038,8 @@ SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
Op.getOperand(0));
// Generate SUBS & CSEL.
- SDValue Cmp =
- DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
- Op.getOperand(0), DAG.getConstant(0, DL, VT));
+ SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT),
+ Op.getOperand(0), DAG.getConstant(0, DL, VT));
return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
Cmp.getValue(1));
@@ -11108,7 +11108,7 @@ SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
SDValue Carry = Op.getOperand(2);
// SBCS uses a carry not a borrow so the carry flag should be inverted first.
SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
- SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, MVT::Glue),
+ SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, FlagsVT),
LHS, RHS, InvCarry);
EVT OpVT = Op.getValueType();
@@ -12441,10 +12441,10 @@ SDValue AArch64TargetLowering::LowerAsmOutputForConstraint(
// Get NZCV register. Only update chain when copyfrom is glued.
if (Glue.getNode()) {
- Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32, Glue);
+ Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, FlagsVT, Glue);
Chain = Glue.getValue(1);
} else
- Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32);
+ Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, FlagsVT);
// Extract CC code.
SDValue CC = getSETCC(Cond, Glue, DL, DAG);
@@ -17343,12 +17343,17 @@ bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
-bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
+bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
+ Value *LaneMask,
ShuffleVectorInst *SVI,
unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
+ auto *SI = dyn_cast<StoreInst>(Store);
+ if (!SI)
+ return false;
+ assert(!LaneMask && "Unexpected mask on store");
auto *VecTy = cast<FixedVectorType>(SVI->getType());
assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
@@ -18015,11 +18020,14 @@ bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
unsigned ShlAmt = C2->getZExtValue();
if (auto ShouldADD = *N->user_begin();
ShouldADD->getOpcode() == ISD::ADD && ShouldADD->hasOneUse()) {
- if (auto ShouldLOAD = dyn_cast<LoadSDNode>(*ShouldADD->user_begin())) {
- unsigned ByteVT = ShouldLOAD->getMemoryVT().getSizeInBits() / 8;
- if ((1ULL << ShlAmt) == ByteVT &&
- isIndexedLoadLegal(ISD::PRE_INC, ShouldLOAD->getMemoryVT()))
- return false;
+ if (auto Load = dyn_cast<LoadSDNode>(*ShouldADD->user_begin())) {
+ EVT MemVT = Load->getMemoryVT();
+
+ if (Load->getValueType(0).isScalableVector())
+ return (8ULL << ShlAmt) != MemVT.getScalarSizeInBits();
+
+ if (isIndexedLoadLegal(ISD::PRE_INC, MemVT))
+ return (8ULL << ShlAmt) != MemVT.getFixedSizeInBits();
}
}
}
@@ -18588,7 +18596,7 @@ AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
Created.push_back(And.getNode());
} else {
SDValue CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC);
- SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+ SDVTList VTs = DAG.getVTList(VT, FlagsVT);
SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
@@ -19477,10 +19485,10 @@ static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) {
// can select to CCMN to avoid the extra mov
SDValue AbsOp1 =
DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0));
- CCmp = DAG.getNode(AArch64ISD::CCMN, DL, MVT_CC, Cmp1.getOperand(0), AbsOp1,
- NZCVOp, Condition, Cmp0);
+ CCmp = DAG.getNode(AArch64ISD::CCMN, DL, FlagsVT, Cmp1.getOperand(0),
+ AbsOp1, NZCVOp, Condition, Cmp0);
} else {
- CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0),
+ CCmp = DAG.getNode(AArch64ISD::CCMP, DL, FlagsVT, Cmp1.getOperand(0),
Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
}
return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
@@ -25129,8 +25137,9 @@ static SDValue reassociateCSELOperandsForCSE(SDNode *N, SelectionDAG &DAG) {
if (!TReassocOp && !FReassocOp)
return SDValue();
- SDValue NewCmp = DAG.getNode(AArch64ISD::SUBS, SDLoc(SubsNode),
- DAG.getVTList(VT, MVT_CC), CmpOpOther, SubsOp);
+ SDValue NewCmp =
+ DAG.getNode(AArch64ISD::SUBS, SDLoc(SubsNode),
+ DAG.getVTList(VT, FlagsVT), CmpOpOther, SubsOp);
auto Reassociate = [&](SDValue ReassocOp, unsigned OpNum) {
if (!ReassocOp)
@@ -27156,7 +27165,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
: AArch64SysReg::RNDRRS);
SDLoc DL(N);
SDValue A = DAG.getNode(
- AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::i32, MVT::Other),
+ AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, FlagsVT, MVT::Other),
N->getOperand(0), DAG.getConstant(Register, DL, MVT::i32));
SDValue B = DAG.getNode(
AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
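
Most of the changes in this file replace scattered MVT::i32 / MVT::Glue / MVT_CC uses with the single FlagsVT constant for the NZCV result of flag-producing nodes. A standalone scalar model (an illustration only, not LLVM code) of the value-plus-flags pairing that FlagsVT names, with SUBS feeding CSEL as in several of the hunks above:

#include <cstdint>
#include <cstdio>

struct NZCV { bool n, z, c, v; }; // the four AArch64 condition flags

// subs: a subtraction that also produces NZCV, mirroring the two-result
// (value type, FlagsVT) node lists built above.
static int64_t subs(int64_t a, int64_t b, NZCV &f) {
  uint64_t ua = (uint64_t)a, ub = (uint64_t)b;
  int64_t r = (int64_t)(ua - ub);
  f.n = r < 0;
  f.z = r == 0;
  f.c = ua >= ub;                                     // C = no borrow
  f.v = ((a < 0) != (b < 0)) && ((r < 0) != (a < 0)); // signed overflow
  return r;
}

// csel with condition GE: picks tval when N == V, otherwise fval.
static int64_t csel_ge(int64_t tval, int64_t fval, const NZCV &f) {
  return (f.n == f.v) ? tval : fval;
}

int main() {
  NZCV flags;
  (void)subs(3, 7, flags);              // the flags result feeds the select
  std::printf("max(3,7) = %lld\n", (long long)csel_ge(3, 7, flags)); // prints 7
  return 0;
}
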
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 713793e..d8403c2 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -215,7 +215,8 @@ public:
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
- bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+ bool lowerInterleavedStore(Instruction *Store, Value *Mask,
+ ShuffleVectorInst *SVI,
unsigned Factor) const override;
bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index bc57537..802e4a9 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -533,8 +533,9 @@ bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
MBP.LHS = LastInst->getOperand(0);
MBP.RHS = MachineOperand::CreateImm(0);
- MBP.Predicate = LastOpc == AArch64::CBNZX ? MachineBranchPredicate::PRED_NE
- : MachineBranchPredicate::PRED_EQ;
+ MBP.Predicate = (LastOpc == AArch64::CBNZX || LastOpc == AArch64::CBNZW)
+ ? MachineBranchPredicate::PRED_NE
+ : MachineBranchPredicate::PRED_EQ;
return false;
}
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 9f8a257..9ebdf2e 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -430,26 +430,27 @@ def UseWzrToVecMove : Predicate<"Subtarget->useWzrToVecMove()">;
def SDTBinaryArithWithFlagsOut : SDTypeProfile<2, 2,
[SDTCisSameAs<0, 2>,
SDTCisSameAs<0, 3>,
- SDTCisInt<0>, SDTCisVT<1, i32>]>;
+ SDTCisInt<0>,
+ SDTCisVT<1, FlagsVT>]>;
// SDTBinaryArithWithFlagsIn - RES1, FLAGS = op LHS, RHS, FLAGS
def SDTBinaryArithWithFlagsIn : SDTypeProfile<1, 3,
[SDTCisSameAs<0, 1>,
SDTCisSameAs<0, 2>,
SDTCisInt<0>,
- SDTCisVT<3, i32>]>;
+ SDTCisVT<3, FlagsVT>]>;
// SDTBinaryArithWithFlagsInOut - RES1, FLAGS = op LHS, RHS, FLAGS
def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3,
[SDTCisSameAs<0, 2>,
SDTCisSameAs<0, 3>,
SDTCisInt<0>,
- SDTCisVT<1, i32>,
- SDTCisVT<4, i32>]>;
+ SDTCisVT<1, FlagsVT>,
+ SDTCisVT<4, FlagsVT>]>;
def SDT_AArch64Brcond : SDTypeProfile<0, 3,
[SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>,
- SDTCisVT<2, i32>]>;
+ SDTCisVT<2, FlagsVT>]>;
def SDT_AArch64cbz : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisVT<1, OtherVT>]>;
def SDT_AArch64tbz : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>,
SDTCisVT<2, OtherVT>]>;
@@ -458,22 +459,22 @@ def SDT_AArch64CSel : SDTypeProfile<1, 4,
[SDTCisSameAs<0, 1>,
SDTCisSameAs<0, 2>,
SDTCisInt<3>,
- SDTCisVT<4, i32>]>;
+ SDTCisVT<4, FlagsVT>]>;
def SDT_AArch64CCMP : SDTypeProfile<1, 5,
- [SDTCisVT<0, i32>,
+ [SDTCisVT<0, FlagsVT>,
SDTCisInt<1>,
SDTCisSameAs<1, 2>,
SDTCisInt<3>,
SDTCisInt<4>,
SDTCisVT<5, i32>]>;
def SDT_AArch64FCCMP : SDTypeProfile<1, 5,
- [SDTCisVT<0, i32>,
+ [SDTCisVT<0, FlagsVT>,
SDTCisFP<1>,
SDTCisSameAs<1, 2>,
SDTCisInt<3>,
SDTCisInt<4>,
SDTCisVT<5, i32>]>;
-def SDT_AArch64FCmp : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
+def SDT_AArch64FCmp : SDTypeProfile<1, 2, [SDTCisVT<0, FlagsVT>,
SDTCisFP<1>,
SDTCisSameAs<2, 1>]>;
def SDT_AArch64Rev : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>]>;
@@ -1124,10 +1125,10 @@ def AArch64probedalloca
SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
[SDNPHasChain, SDNPMayStore]>;
-// MRS, also sets the flags via a glue.
+// MRS, also sets the flags.
def AArch64mrs : SDNode<"AArch64ISD::MRS",
SDTypeProfile<2, 1, [SDTCisVT<0, i64>,
- SDTCisVT<1, i32>,
+ SDTCisVT<1, FlagsVT>,
SDTCisVT<2, i32>]>,
[SDNPHasChain]>;
@@ -3934,6 +3935,26 @@ defm LDRSW : LoadUI<0b10, 0, 0b10, GPR64, uimm12s4, "ldrsw",
def : Pat<(i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))),
(SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>;
+// load zero-extended i32, bitcast to f64
+def : Pat <(f64 (bitconvert (i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
+ (SUBREG_TO_REG (i64 0), (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
+
+// load zero-extended i16, bitcast to f64
+def : Pat <(f64 (bitconvert (i64 (zextloadi16 (am_indexed32 GPR64sp:$Rn, uimm12s2:$offset))))),
+ (SUBREG_TO_REG (i64 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
+
+// load zero-extended i8, bitcast to f64
+def : Pat <(f64 (bitconvert (i64 (zextloadi8 (am_indexed32 GPR64sp:$Rn, uimm12s1:$offset))))),
+ (SUBREG_TO_REG (i64 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
+
+// load zero-extended i16, bitcast to f32
+def : Pat <(f32 (bitconvert (i32 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
+ (SUBREG_TO_REG (i32 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
+
+// load zero-extended i8, bitcast to f32
+def : Pat <(f32 (bitconvert (i32 (zextloadi8 (am_indexed16 GPR64sp:$Rn, uimm12s1:$offset))))),
+ (SUBREG_TO_REG (i32 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
+
// Pre-fetch.
def PRFMui : PrefetchUI<0b11, 0, 0b10, "prfm",
[(AArch64Prefetch timm:$Rt,
diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
index 0ddd17c..abcd550 100644
--- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -8,8 +8,8 @@
//
// This pass performs below peephole optimizations on MIR level.
//
-// 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri
-// MOVi64imm + ANDXrr ==> ANDXri + ANDXri
+// 1. MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri
+// MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri
//
// 2. MOVi32imm + ADDWrr ==> ADDWRi + ADDWRi
// MOVi64imm + ADDXrr ==> ANDXri + ANDXri
@@ -126,7 +126,7 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass {
bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI);
template <typename T>
- bool visitAND(unsigned Opc, MachineInstr &MI);
+ bool visitAND(unsigned Opc, MachineInstr &MI, unsigned OtherOpc = 0);
bool visitORR(MachineInstr &MI);
bool visitCSEL(MachineInstr &MI);
bool visitINSERT(MachineInstr &MI);
@@ -194,12 +194,12 @@ static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
}
template <typename T>
-bool AArch64MIPeepholeOpt::visitAND(
- unsigned Opc, MachineInstr &MI) {
+bool AArch64MIPeepholeOpt::visitAND(unsigned Opc, MachineInstr &MI,
+ unsigned OtherOpc) {
// Try below transformation.
//
- // MOVi32imm + ANDWrr ==> ANDWri + ANDWri
- // MOVi64imm + ANDXrr ==> ANDXri + ANDXri
+ // MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri
+ // MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri
//
// The mov pseudo instruction could be expanded to multiple mov instructions
// later. Let's try to split the constant operand of mov instruction into two
@@ -208,10 +208,10 @@ bool AArch64MIPeepholeOpt::visitAND(
return splitTwoPartImm<T>(
MI,
- [Opc](T Imm, unsigned RegSize, T &Imm0,
- T &Imm1) -> std::optional<OpcodePair> {
+ [Opc, OtherOpc](T Imm, unsigned RegSize, T &Imm0,
+ T &Imm1) -> std::optional<OpcodePair> {
if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1))
- return std::make_pair(Opc, Opc);
+ return std::make_pair(Opc, !OtherOpc ? Opc : OtherOpc);
return std::nullopt;
},
[&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
@@ -864,6 +864,12 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
case AArch64::ANDXrr:
Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI);
break;
+ case AArch64::ANDSWrr:
+ Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI, AArch64::ANDSWri);
+ break;
+ case AArch64::ANDSXrr:
+ Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI, AArch64::ANDSXri);
+ break;
case AArch64::ORRWrs:
Changed |= visitORR(MI);
break;
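
The peephole extended above rewrites MOVi32imm/MOVi64imm + AND(S)rr into two AND-with-immediate instructions when the constant, while not itself a valid AArch64 logical immediate, is the AND of two values that are. A brute-force standalone sketch of that splitting idea (my own illustration; the encodability check here is an exhaustive search rather than the encoder LLVM uses, and the GCC/Clang __builtin_ctzll/__builtin_clzll intrinsics are assumed):

#include <cstdint>
#include <cstdio>

// Rotate right within a field of 'width' bits (width is a power of two <= 64).
static uint64_t rotr(uint64_t v, unsigned r, unsigned width) {
  uint64_t mask = (width == 64) ? ~0ULL : ((1ULL << width) - 1);
  v &= mask;
  r %= width;
  return r ? (((v >> r) | (v << (width - r))) & mask) : v;
}

// AArch64 logical immediates are rotations of a repeated element (size 2..64)
// holding a contiguous run of ones; all-zeros and all-ones are not encodable.
static bool isLogicalImm(uint64_t imm, unsigned regSize) {
  uint64_t regMask = (regSize == 64) ? ~0ULL : ((1ULL << regSize) - 1);
  imm &= regMask;
  if (imm == 0 || imm == regMask)
    return false;
  for (unsigned e = 2; e <= regSize; e *= 2)    // element size
    for (unsigned ones = 1; ones < e; ++ones)   // run length inside the element
      for (unsigned rot = 0; rot < e; ++rot) {
        uint64_t elt = rotr((1ULL << ones) - 1, rot, e), pat = 0;
        for (unsigned i = 0; i < regSize; i += e)
          pat |= elt << i;
        if (pat == imm)
          return true;
      }
  return false;
}

// Split imm into m1 & m2 == imm with both halves encodable: m1 covers the span
// [lowest set bit, highest set bit] with ones, and m2 = imm | ~m1.
static bool splitMask(uint64_t imm, unsigned regSize, uint64_t &m1, uint64_t &m2) {
  uint64_t regMask = (regSize == 64) ? ~0ULL : ((1ULL << regSize) - 1);
  imm &= regMask;
  if (imm == 0 || isLogicalImm(imm, regSize))
    return false;                 // nothing to split / a single ANDri suffices
  unsigned lo = (unsigned)__builtin_ctzll(imm);
  unsigned hi = 63 - (unsigned)__builtin_clzll(imm);
  m1 = ((2ULL << hi) - (1ULL << lo)) & regMask;
  m2 = (imm | ~m1) & regMask;
  return isLogicalImm(m1, regSize) && isLogicalImm(m2, regSize);
}

int main() {
  uint64_t imm = (1ULL << 21) | (1ULL << 10), m1, m2; // two isolated set bits
  if (splitMask(imm, 32, m1, m2))
    std::printf("0x%llx == 0x%llx & 0x%llx\n", (unsigned long long)imm,
                (unsigned long long)m1, (unsigned long long)m2);
  return 0;
}
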
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
index 61bf87f..1a7609b 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -305,7 +305,8 @@ def GPR64pi48 : RegisterOperand<GPR64, "printPostIncOperand<48>">;
def GPR64pi64 : RegisterOperand<GPR64, "printPostIncOperand<64>">;
// Condition code regclass.
-def CCR : RegisterClass<"AArch64", [i32], 32, (add NZCV)> {
+defvar FlagsVT = i32;
+def CCR : RegisterClass<"AArch64", [FlagsVT], 32, (add NZCV)> {
let CopyCost = -1; // Don't allow copying of status registers.
// CCR is not allocatable.
diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index bafb8d0..8a5b5ba 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -32,10 +32,29 @@ AArch64SelectionDAGInfo::AArch64SelectionDAGInfo()
void AArch64SelectionDAGInfo::verifyTargetNode(const SelectionDAG &DAG,
const SDNode *N) const {
+ SelectionDAGGenTargetInfo::verifyTargetNode(DAG, N);
+
#ifndef NDEBUG
+ // Some additional checks not yet implemented by verifyTargetNode.
+ constexpr MVT FlagsVT = MVT::i32;
switch (N->getOpcode()) {
- default:
- return SelectionDAGGenTargetInfo::verifyTargetNode(DAG, N);
+ case AArch64ISD::SUBS:
+ assert(N->getValueType(1) == FlagsVT);
+ break;
+ case AArch64ISD::ADC:
+ case AArch64ISD::SBC:
+ assert(N->getOperand(2).getValueType() == FlagsVT);
+ break;
+ case AArch64ISD::ADCS:
+ case AArch64ISD::SBCS:
+ assert(N->getValueType(1) == FlagsVT);
+ assert(N->getOperand(2).getValueType() == FlagsVT);
+ break;
+ case AArch64ISD::CSEL:
+ case AArch64ISD::CSINC:
+ case AArch64ISD::BRCOND:
+ assert(N->getOperand(3).getValueType() == FlagsVT);
+ break;
case AArch64ISD::SADDWT:
case AArch64ISD::SADDWB:
case AArch64ISD::UADDWT:
diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
index 75c7dd9..f136a184 100644
--- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
+++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
@@ -581,7 +581,6 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
// statement if return_twice functions are called.
bool StandardLifetime =
!SInfo.CallsReturnTwice &&
- SInfo.UnrecognizedLifetimes.empty() &&
memtag::isStandardLifetime(Info.LifetimeStart, Info.LifetimeEnd, DT, LI,
ClMaxLifetimes);
if (StandardLifetime) {
@@ -616,10 +615,5 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
memtag::annotateDebugRecords(Info, Tag);
}
- // If we have instrumented at least one alloca, all unrecognized lifetime
- // intrinsics have to go.
- for (auto *I : SInfo.UnrecognizedLifetimes)
- I->eraseFromParent();
-
return true;
}
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 2409cc8..0f4f012 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -534,7 +534,7 @@ unsigned AArch64Subtarget::classifyGlobalFunctionReference(
}
void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
- unsigned NumRegionInstrs) const {
+ const SchedRegion &Region) const {
// LNT run (at least on Cyclone) showed reasonably significant gains for
// bi-directional scheduling. 253.perlbmk.
Policy.OnlyTopDown = false;
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 154db3c..061ed61 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -343,7 +343,8 @@ public:
}
void overrideSchedPolicy(MachineSchedPolicy &Policy,
- unsigned NumRegionInstrs) const override;
+ const SchedRegion &Region) const override;
+
void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
SDep &Dep,
const TargetSchedModel *SchedModel) const override;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 90d3d92..40f49da 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -249,7 +249,7 @@ static bool hasPossibleIncompatibleOps(const Function *F) {
return false;
}
-uint64_t AArch64TTIImpl::getFeatureMask(const Function &F) const {
+APInt AArch64TTIImpl::getFeatureMask(const Function &F) const {
StringRef AttributeStr =
isMultiversionedFunction(F) ? "fmv-features" : "target-features";
StringRef FeatureStr = F.getFnAttribute(AttributeStr).getValueAsString();
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index b27eb2e..7f45177 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -89,7 +89,7 @@ public:
unsigned getInlineCallPenalty(const Function *F, const CallBase &Call,
unsigned DefaultCallPenalty) const override;
- uint64_t getFeatureMask(const Function &F) const override;
+ APInt getFeatureMask(const Function &F) const override;
bool isMultiversionedFunction(const Function &F) const override;
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
index 3d4a14b..1a9bce5 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
@@ -9,8 +9,6 @@
#include "AArch64MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCStreamer.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 6076ac4..8b8fc8b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -149,6 +149,12 @@ def FeatureFmaMixInsts : SubtargetFeature<"fma-mix-insts",
"Has v_fma_mix_f32, v_fma_mixlo_f16, v_fma_mixhi_f16 instructions"
>;
+def FeatureFmaMixBF16Insts : SubtargetFeature<"fma-mix-bf16-insts",
+ "HasFmaMixBF16Insts",
+ "true",
+ "Has v_fma_mix_f32_bf16, v_fma_mixlo_bf16, v_fma_mixhi_bf16 instructions"
+>;
+
def FeatureIEEEMinimumMaximumInsts : SubtargetFeature<"ieee-minimum-maximum-insts",
"HasIEEEMinimumMaximumInsts",
"true",
@@ -167,6 +173,12 @@ def FeatureMinimum3Maximum3F16 : SubtargetFeature<"minimum3-maximum3-f16",
"Has v_minimum3_f16 and v_maximum3_f16 instructions"
>;
+def FeatureMin3Max3PKF16 : SubtargetFeature<"min3-max3-pkf16",
+ "HasMin3Max3PKF16",
+ "true",
+ "Has v_pk_min3_num_f16 and v_pk_max3_num_f16 instructions"
+>;
+
def FeatureMinimum3Maximum3PKF16 : SubtargetFeature<"minimum3-maximum3-pkf16",
"HasMinimum3Maximum3PKF16",
"true",
@@ -256,12 +268,24 @@ def FeatureInstFwdPrefetchBug : SubtargetFeature<"inst-fwd-prefetch-bug",
"S_INST_PREFETCH instruction causes shader to hang"
>;
+def FeatureVmemPrefInsts : SubtargetFeature<"vmem-pref-insts",
+ "HasVmemPrefInsts",
+ "true",
+ "Has flat_prefect_b8 and global_prefetch_b8 instructions"
+>;
+
def FeatureSafeSmemPrefetch : SubtargetFeature<"safe-smem-prefetch",
"HasSafeSmemPrefetch",
"true",
"SMEM prefetches do not fail on illegal address"
>;
+def FeatureSafeCUPrefetch : SubtargetFeature<"safe-cu-prefetch",
+ "HasSafeCUPrefetch",
+ "true",
+ "VMEM CU scope prefetches do not fail on illegal address"
+>;
+
def FeatureVcmpxExecWARHazard : SubtargetFeature<"vcmpx-exec-war-hazard",
"HasVcmpxExecWARHazard",
"true",
@@ -559,6 +583,12 @@ def FeatureBF16ConversionInsts : SubtargetFeature<"bf16-cvt-insts",
"Has bf16 conversion instructions"
>;
+def FeatureBF16PackedInsts : SubtargetFeature<"bf16-pk-insts",
+ "HasBF16PackedInsts",
+ "true",
+ "Has bf16 packed instructions (fma, add, mul, max, min)"
+>;
+
def FeatureVOP3P : SubtargetFeature<"vop3p",
"HasVOP3PInsts",
"true",
@@ -1349,6 +1379,10 @@ def FeatureLshlAddU64Inst
: SubtargetFeature<"lshl-add-u64-inst", "HasLshlAddU64Inst", "true",
"Has v_lshl_add_u64 instruction">;
+def FeatureAddSubU64Insts
+ : SubtargetFeature<"add-sub-u64-insts", "HasAddSubU64Insts", "true",
+ "Has v_add_u64 and v_sub_u64 instructions">;
+
def FeatureMemToLDSLoad : SubtargetFeature<"vmem-to-lds-load-insts",
"HasVMemToLDSLoad",
"true",
@@ -1989,7 +2023,10 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureTransposeLoadF4F6Insts,
FeatureBF16TransInsts,
FeatureBF16ConversionInsts,
+ FeatureBF16PackedInsts,
FeatureCvtPkF16F32Inst,
+ FeatureFmaMixBF16Insts,
+ FeatureMin3Max3PKF16,
FeatureMinimum3Maximum3PKF16,
FeaturePrngInst,
FeaturePermlane16Swap,
@@ -2002,7 +2039,9 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureFlatBufferGlobalAtomicFaddF64Inst,
FeatureMemoryAtomicFAddF32DenormalSupport,
FeatureKernargPreload,
+ FeatureVmemPrefInsts,
FeatureLshlAddU64Inst,
+ FeatureAddSubU64Insts,
FeatureLdsBarrierArriveAtomic,
FeatureSetPrioIncWgInst,
]>;
@@ -2349,6 +2388,10 @@ def HasMinimum3Maximum3F16 :
Predicate<"Subtarget->hasMinimum3Maximum3F16()">,
AssemblerPredicate<(all_of FeatureMinimum3Maximum3F16)>;
+def HasMin3Max3PKF16 :
+ Predicate<"Subtarget->hasMin3Max3PKF16()">,
+ AssemblerPredicate<(all_of FeatureMin3Max3PKF16)>;
+
def HasMinimum3Maximum3PKF16 :
Predicate<"Subtarget->hasMinimum3Maximum3PKF16()">,
AssemblerPredicate<(all_of FeatureMinimum3Maximum3PKF16)>;
@@ -2472,6 +2515,9 @@ def HasBF16TransInsts : Predicate<"Subtarget->hasBF16TransInsts()">,
def HasBF16ConversionInsts : Predicate<"Subtarget->hasBF16ConversionInsts()">,
AssemblerPredicate<(all_of FeatureBF16ConversionInsts)>;
+def HasBF16PackedInsts : Predicate<"Subtarget->hasBF16PackedInsts()">,
+ AssemblerPredicate<(all_of FeatureBF16PackedInsts)>;
+
def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">,
AssemblerPredicate<(all_of FeatureVOP3P)>;
@@ -2519,6 +2565,14 @@ def HasFmaakFmamkF64Insts :
Predicate<"Subtarget->hasFmaakFmamkF64Insts()">,
AssemblerPredicate<(any_of FeatureGFX1250Insts)>;
+def HasPkAddMinMaxInsts :
+ Predicate<"Subtarget->hasPkAddMinMaxInsts()">,
+ AssemblerPredicate<(any_of FeatureGFX1250Insts)>;
+
+def HasPkMinMax3Insts :
+ Predicate<"Subtarget->hasPkMinMax3Insts()">,
+ AssemblerPredicate<(any_of FeatureGFX1250Insts)>;
+
def HasImageInsts : Predicate<"Subtarget->hasImageInsts()">,
AssemblerPredicate<(all_of FeatureImageInsts)>;
@@ -2565,6 +2619,9 @@ def HasMovrel : Predicate<"Subtarget->hasMovrel()">,
def HasFmaMixInsts : Predicate<"Subtarget->hasFmaMixInsts()">,
AssemblerPredicate<(all_of FeatureFmaMixInsts)>;
+def HasFmaMixBF16Insts : Predicate<"Subtarget->hasFmaMixBF16Insts()">,
+ AssemblerPredicate<(all_of FeatureFmaMixBF16Insts)>;
+
def HasDLInsts : Predicate<"Subtarget->hasDLInsts()">,
AssemblerPredicate<(all_of FeatureDLInsts)>;
@@ -2763,12 +2820,18 @@ def HasScalarDwordx3Loads : Predicate<"Subtarget->hasScalarDwordx3Loads()">;
def HasXF32Insts : Predicate<"Subtarget->hasXF32Insts()">,
AssemblerPredicate<(all_of FeatureXF32Insts)>;
+def HasVmemPrefInsts : Predicate<"Subtarget->hasVmemPrefInsts()">,
+ AssemblerPredicate<(all_of FeatureVmemPrefInsts)>;
+
def HasAshrPkInsts : Predicate<"Subtarget->hasAshrPkInsts()">,
AssemblerPredicate<(all_of FeatureAshrPkInsts)>;
def HasLshlAddU64Inst : Predicate<"Subtarget->hasLshlAddU64Inst()">,
AssemblerPredicate<(all_of FeatureLshlAddU64Inst)>;
+def HasAddSubU64Insts : Predicate<"Subtarget->hasAddSubU64Insts()">,
+ AssemblerPredicate<(all_of FeatureAddSubU64Insts)>;
+
def HasLdsBarrierArriveAtomic : Predicate<"Subtarget->hasLdsBarrierArriveAtomic()">,
AssemblerPredicate<(all_of FeatureLdsBarrierArriveAtomic)>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index dedee46..49d8b44 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -1383,7 +1383,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
&AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID,
&AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
&AAUnderlyingObjects::ID, &AANoAliasAddrSpace::ID, &AAAddressSpace::ID,
- &AAIndirectCallInfo::ID, &AAInstanceInfo::ID});
+ &AAIndirectCallInfo::ID});
AttributorConfig AC(CGUpdater);
AC.IsClosedWorldModule = Options.IsClosedWorld;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 891d362..108842f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -446,5 +446,8 @@ def gi_fp_pow2_to_exponent : GICustomOperandRenderer<"renderFPPow2ToExponent">,
def gi_as_hw_round_mode : GICustomOperandRenderer<"renderRoundMode">,
GISDNodeXFormEquiv<as_hw_round_mode>;
+def gi_prefetch_loc : GICustomOperandRenderer<"renderPrefetchLoc">,
+ GISDNodeXFormEquiv<PrefetchLoc>;
+
def gi_MFMALdScaleModifierOp : GICustomOperandRenderer<"renderScaledMAIIntrinsicOperand">,
GISDNodeXFormEquiv<MFMALdScaleXForm>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
index 00979f4..f36935d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
@@ -117,45 +117,72 @@ static LLT getReadAnyLaneSplitTy(LLT Ty) {
return LLT::scalar(32);
}
-static Register buildReadAnyLane(MachineIRBuilder &B, Register VgprSrc,
- const RegisterBankInfo &RBI);
-
-static void unmergeReadAnyLane(MachineIRBuilder &B,
- SmallVectorImpl<Register> &SgprDstParts,
- LLT UnmergeTy, Register VgprSrc,
- const RegisterBankInfo &RBI) {
+template <typename ReadLaneFnTy>
+static Register buildReadLane(MachineIRBuilder &, Register,
+ const RegisterBankInfo &, ReadLaneFnTy);
+
+template <typename ReadLaneFnTy>
+static void
+unmergeReadAnyLane(MachineIRBuilder &B, SmallVectorImpl<Register> &SgprDstParts,
+ LLT UnmergeTy, Register VgprSrc, const RegisterBankInfo &RBI,
+ ReadLaneFnTy BuildRL) {
const RegisterBank *VgprRB = &RBI.getRegBank(AMDGPU::VGPRRegBankID);
auto Unmerge = B.buildUnmerge({VgprRB, UnmergeTy}, VgprSrc);
for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) {
- SgprDstParts.push_back(buildReadAnyLane(B, Unmerge.getReg(i), RBI));
+ SgprDstParts.push_back(buildReadLane(B, Unmerge.getReg(i), RBI, BuildRL));
}
}
-static Register buildReadAnyLane(MachineIRBuilder &B, Register VgprSrc,
- const RegisterBankInfo &RBI) {
+template <typename ReadLaneFnTy>
+static Register buildReadLane(MachineIRBuilder &B, Register VgprSrc,
+ const RegisterBankInfo &RBI,
+ ReadLaneFnTy BuildRL) {
LLT Ty = B.getMRI()->getType(VgprSrc);
const RegisterBank *SgprRB = &RBI.getRegBank(AMDGPU::SGPRRegBankID);
if (Ty.getSizeInBits() == 32) {
- return B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {{SgprRB, Ty}}, {VgprSrc})
- .getReg(0);
+ Register SgprDst = B.getMRI()->createVirtualRegister({SgprRB, Ty});
+ return BuildRL(B, SgprDst, VgprSrc).getReg(0);
}
SmallVector<Register, 8> SgprDstParts;
- unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI);
+ unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI,
+ BuildRL);
return B.buildMergeLikeInstr({SgprRB, Ty}, SgprDstParts).getReg(0);
}
-void AMDGPU::buildReadAnyLane(MachineIRBuilder &B, Register SgprDst,
- Register VgprSrc, const RegisterBankInfo &RBI) {
+template <typename ReadLaneFnTy>
+static void buildReadLane(MachineIRBuilder &B, Register SgprDst,
+ Register VgprSrc, const RegisterBankInfo &RBI,
+ ReadLaneFnTy BuildReadLane) {
LLT Ty = B.getMRI()->getType(VgprSrc);
if (Ty.getSizeInBits() == 32) {
- B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {SgprDst}, {VgprSrc});
+ BuildReadLane(B, SgprDst, VgprSrc);
return;
}
SmallVector<Register, 8> SgprDstParts;
- unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI);
+ unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI,
+ BuildReadLane);
B.buildMergeLikeInstr(SgprDst, SgprDstParts).getReg(0);
}
+
+void AMDGPU::buildReadAnyLane(MachineIRBuilder &B, Register SgprDst,
+ Register VgprSrc, const RegisterBankInfo &RBI) {
+ return buildReadLane(
+ B, SgprDst, VgprSrc, RBI,
+ [](MachineIRBuilder &B, Register SgprDst, Register VgprSrc) {
+ return B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {SgprDst}, {VgprSrc});
+ });
+}
+
+void AMDGPU::buildReadFirstLane(MachineIRBuilder &B, Register SgprDst,
+ Register VgprSrc, const RegisterBankInfo &RBI) {
+ return buildReadLane(
+ B, SgprDst, VgprSrc, RBI,
+ [](MachineIRBuilder &B, Register SgprDst, Register VgprSrc) {
+ return B.buildIntrinsic(Intrinsic::amdgcn_readfirstlane, SgprDst)
+ .addReg(VgprSrc);
+ });
+}
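
The refactor above keeps the recursive unmerge/merge walk in one place and lets the caller supply the 32-bit leaf operation, so buildReadAnyLane and the new buildReadFirstLane differ only in the instruction emitted per piece. A toy standalone sketch of that callback-parameterised structure (plain C++ with made-up stand-ins for registers and instructions, not the GlobalISel API; the splitting of the source value itself is elided):

#include <cstdio>
#include <string>
#include <vector>

// Toy stand-ins: a "register" is an integer id, an emitted "instruction" a string.
struct Builder {
  std::vector<std::string> MIs;
  int NextReg = 1;
  int newReg() { return NextReg++; }
};

// Generic walk: 32-bit sources are handled directly by the callable; wider
// sources are handled 32 bits at a time and the parts merged back together.
template <typename ReadLaneFnTy>
static int buildReadLane(Builder &B, int VgprSrc, unsigned Bits,
                         ReadLaneFnTy BuildRL) {
  if (Bits == 32)
    return BuildRL(B, VgprSrc);
  std::string Parts;
  for (unsigned i = 0; i < Bits / 32; ++i)
    Parts += " %" + std::to_string(BuildRL(B, VgprSrc)); // one leaf per 32 bits
  int Dst = B.newReg();
  B.MIs.push_back("%" + std::to_string(Dst) + " = G_MERGE_VALUES" + Parts);
  return Dst;
}

int main() {
  Builder B;
  auto AnyLane = [](Builder &B, int Src) {   // leaf op for buildReadAnyLane
    int D = B.newReg();
    B.MIs.push_back("%" + std::to_string(D) + " = G_AMDGPU_READANYLANE %" +
                    std::to_string(Src));
    return D;
  };
  auto FirstLane = [](Builder &B, int Src) { // leaf op for buildReadFirstLane
    int D = B.newReg();
    B.MIs.push_back("%" + std::to_string(D) + " = llvm.amdgcn.readfirstlane %" +
                    std::to_string(Src));
    return D;
  };
  buildReadLane(B, 0, 64, AnyLane);   // same walk, READANYLANE at the leaves
  buildReadLane(B, 0, 64, FirstLane); // same walk, readfirstlane at the leaves
  for (const std::string &S : B.MIs)
    std::printf("%s\n", S.c_str());
  return 0;
}
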
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
index 0c89bb5..5e1000e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
@@ -51,6 +51,8 @@ private:
void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc,
const RegisterBankInfo &RBI);
+void buildReadFirstLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc,
+ const RegisterBankInfo &RBI);
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 00c7f0e..0ca2286 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1863,9 +1863,17 @@ bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
SIInstrFlags::FlatScratch);
}
-// If this matches zero_extend i32:x, return x
-static SDValue matchZExtFromI32(SDValue Op) {
- if (Op.getOpcode() != ISD::ZERO_EXTEND)
+// If this matches *_extend i32:x, return x
+// Otherwise if the value is I32 returns x.
+static SDValue matchExtFromI32orI32(SDValue Op, bool IsSigned,
+ const SelectionDAG *DAG) {
+ if (Op.getValueType() == MVT::i32)
+ return Op;
+
+ if (Op.getOpcode() != (IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND) &&
+ Op.getOpcode() != ISD::ANY_EXTEND &&
+ !(DAG->SignBitIsZero(Op) &&
+ Op.getOpcode() == (IsSigned ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND)))
return SDValue();
SDValue ExtSrc = Op.getOperand(0);
@@ -1873,12 +1881,13 @@ static SDValue matchZExtFromI32(SDValue Op) {
}
// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
-bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
- SDValue Addr,
- SDValue &SAddr,
- SDValue &VOffset,
- SDValue &Offset) const {
+// or (64-bit SGPR base) + (sext vgpr offset) + sext(imm offset)
+bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
+ SDValue &SAddr, SDValue &VOffset,
+ SDValue &Offset, bool &ScaleOffset,
+ bool NeedIOffset) const {
int64_t ImmOffset = 0;
+ ScaleOffset = false;
// Match the immediate offset first, which canonically is moved as low as
// possible.
@@ -1888,7 +1897,8 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
const SIInstrInfo *TII = Subtarget->getInstrInfo();
- if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
+ if (NeedIOffset &&
+ TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
SIInstrFlags::FlatGlobal)) {
Addr = LHS;
ImmOffset = COffsetVal;
@@ -1898,11 +1908,14 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
// saddr + large_offset -> saddr +
// (voffset = large_offset & ~MaxOffset) +
// (large_offset & MaxOffset);
- int64_t SplitImmOffset, RemainderOffset;
- std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
- COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
+ int64_t SplitImmOffset = 0, RemainderOffset = COffsetVal;
+ if (NeedIOffset) {
+ std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
+ COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
+ }
- if (isUInt<32>(RemainderOffset)) {
+ if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset)
+ : isUInt<32>(RemainderOffset)) {
SDNode *VMov = CurDAG->getMachineNode(
AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
@@ -1929,21 +1942,26 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
// Match the variable offset.
if (Addr.getOpcode() == ISD::ADD) {
LHS = Addr.getOperand(0);
- RHS = Addr.getOperand(1);
if (!LHS->isDivergent()) {
- // add (i64 sgpr), (zero_extend (i32 vgpr))
- if (SDValue ZextRHS = matchZExtFromI32(RHS)) {
+ // add (i64 sgpr), (*_extend (i32 vgpr))
+ RHS = Addr.getOperand(1);
+ ScaleOffset = SelectScaleOffset(N, RHS, Subtarget->hasSignedGVSOffset());
+ if (SDValue ExtRHS = matchExtFromI32orI32(
+ RHS, Subtarget->hasSignedGVSOffset(), CurDAG)) {
SAddr = LHS;
- VOffset = ZextRHS;
+ VOffset = ExtRHS;
}
}
+ RHS = Addr.getOperand(1);
if (!SAddr && !RHS->isDivergent()) {
- // add (zero_extend (i32 vgpr)), (i64 sgpr)
- if (SDValue ZextLHS = matchZExtFromI32(LHS)) {
+ // add (*_extend (i32 vgpr)), (i64 sgpr)
+ ScaleOffset = SelectScaleOffset(N, LHS, Subtarget->hasSignedGVSOffset());
+ if (SDValue ExtLHS = matchExtFromI32orI32(
+ LHS, Subtarget->hasSignedGVSOffset(), CurDAG)) {
SAddr = RHS;
- VOffset = ZextLHS;
+ VOffset = ExtLHS;
}
}
@@ -1953,6 +1971,27 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
}
}
+ if (Subtarget->hasScaleOffset() &&
+ (Addr.getOpcode() == (Subtarget->hasSignedGVSOffset()
+ ? AMDGPUISD::MAD_I64_I32
+ : AMDGPUISD::MAD_U64_U32) ||
+ (Addr.getOpcode() == AMDGPUISD::MAD_U64_U32 &&
+ CurDAG->SignBitIsZero(Addr.getOperand(0)))) &&
+ Addr.getOperand(0)->isDivergent() &&
+ isa<ConstantSDNode>(Addr.getOperand(1)) &&
+ !Addr.getOperand(2)->isDivergent()) {
+ // mad_u64_u32 (i32 vgpr), (i32 c), (i64 sgpr)
+ unsigned Size =
+ (unsigned)cast<MemSDNode>(N)->getMemoryVT().getFixedSizeInBits() / 8;
+ ScaleOffset = Addr.getConstantOperandVal(1) == Size;
+ if (ScaleOffset) {
+ SAddr = Addr.getOperand(2);
+ VOffset = Addr.getOperand(0);
+ Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
+ return true;
+ }
+ }
+
if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
isa<ConstantSDNode>(Addr))
return false;
@@ -1972,10 +2011,12 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
SDValue &SAddr, SDValue &VOffset,
SDValue &Offset,
SDValue &CPol) const {
- if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset))
+ bool ScaleOffset;
+ if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
return false;
- CPol = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
+ CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
+ SDLoc(), MVT::i32);
return true;
}
@@ -1983,10 +2024,11 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr,
SDValue &SAddr, SDValue &VOffset,
SDValue &Offset,
SDValue &CPol) const {
- if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset))
+ bool ScaleOffset;
+ if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
return false;
- unsigned CPolVal = AMDGPU::CPol::GLC;
+ unsigned CPolVal = (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | AMDGPU::CPol::GLC;
CPol = CurDAG->getTargetConstant(CPolVal, SDLoc(), MVT::i32);
return true;
}
@@ -2074,7 +2116,8 @@ bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
SDValue &VAddr, SDValue &SAddr,
- SDValue &Offset) const {
+ SDValue &Offset,
+ SDValue &CPol) const {
int64_t ImmOffset = 0;
SDValue LHS, RHS;
@@ -2106,6 +2149,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
return false;
Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
+ CPol = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
return true;
}
}
@@ -2139,6 +2183,10 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
return false;
SAddr = SelectSAddrFI(CurDAG, SAddr);
Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
+
+ bool ScaleOffset = SelectScaleOffset(N, VAddr, true /* IsSigned */);
+ CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
+ SDLoc(), MVT::i32);
return true;
}
@@ -2159,17 +2207,59 @@ bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset,
return true;
}
+// Given \p Offset and load node \p N check if an \p Offset is a multiple of
+// the load byte size. If it is update \p Offset to a pre-scaled value and
+// return true.
+bool AMDGPUDAGToDAGISel::SelectScaleOffset(SDNode *N, SDValue &Offset,
+ bool IsSigned) const {
+ bool ScaleOffset = false;
+ if (!Subtarget->hasScaleOffset() || !Offset)
+ return false;
+
+ unsigned Size =
+ (unsigned)cast<MemSDNode>(N)->getMemoryVT().getFixedSizeInBits() / 8;
+
+ SDValue Off = Offset;
+ if (SDValue Ext = matchExtFromI32orI32(Offset, IsSigned, CurDAG))
+ Off = Ext;
+
+ if (isPowerOf2_32(Size) && Off.getOpcode() == ISD::SHL) {
+ if (auto *C = dyn_cast<ConstantSDNode>(Off.getOperand(1)))
+ ScaleOffset = C->getZExtValue() == Log2_32(Size);
+ } else if (Offset.getOpcode() == ISD::MUL ||
+ (IsSigned && Offset.getOpcode() == AMDGPUISD::MUL_I24) ||
+ Offset.getOpcode() == AMDGPUISD::MUL_U24 ||
+ (Offset.isMachineOpcode() &&
+ Offset.getMachineOpcode() ==
+ (IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO
+ : AMDGPU::S_MUL_U64_U32_PSEUDO))) {
+ if (auto *C = dyn_cast<ConstantSDNode>(Offset.getOperand(1)))
+ ScaleOffset = C->getZExtValue() == Size;
+ }
+
+ if (ScaleOffset)
+ Offset = Off.getOperand(0);
+
+ return ScaleOffset;
+}
+
// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
// not null) offset. If Imm32Only is true, match only 32-bit immediate
// offsets available on CI.
-bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
+bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDNode *N, SDValue ByteOffsetNode,
SDValue *SOffset, SDValue *Offset,
bool Imm32Only, bool IsBuffer,
- bool HasSOffset,
- int64_t ImmOffset) const {
+ bool HasSOffset, int64_t ImmOffset,
+ bool *ScaleOffset) const {
assert((!SOffset || !Offset) &&
"Cannot match both soffset and offset at the same time!");
+ if (ScaleOffset) {
+ assert(N && SOffset);
+
+ *ScaleOffset = SelectScaleOffset(N, ByteOffsetNode, false /* IsSigned */);
+ }
+
ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
if (!C) {
if (!SOffset)
@@ -2254,24 +2344,25 @@ SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
// Match a base and an immediate (if Offset is not null) or an SGPR (if
// SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
// true, match only 32-bit immediate offsets available on CI.
-bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
- SDValue *SOffset, SDValue *Offset,
- bool Imm32Only, bool IsBuffer,
- bool HasSOffset,
- int64_t ImmOffset) const {
+bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDNode *N, SDValue Addr,
+ SDValue &SBase, SDValue *SOffset,
+ SDValue *Offset, bool Imm32Only,
+ bool IsBuffer, bool HasSOffset,
+ int64_t ImmOffset,
+ bool *ScaleOffset) const {
if (SOffset && Offset) {
assert(!Imm32Only && !IsBuffer);
SDValue B;
- if (!SelectSMRDBaseOffset(Addr, B, nullptr, Offset, false, false, true))
+ if (!SelectSMRDBaseOffset(N, Addr, B, nullptr, Offset, false, false, true))
return false;
int64_t ImmOff = 0;
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset))
ImmOff = C->getSExtValue();
- return SelectSMRDBaseOffset(B, SBase, SOffset, nullptr, false, false, true,
- ImmOff);
+ return SelectSMRDBaseOffset(N, B, SBase, SOffset, nullptr, false, false,
+ true, ImmOff, ScaleOffset);
}
// A 32-bit (address + offset) should not cause unsigned 32-bit integer
@@ -2291,23 +2382,25 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
if (!N0 || !N1)
return false;
- if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
- ImmOffset)) {
+ if (SelectSMRDOffset(N, N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
+ ImmOffset, ScaleOffset)) {
SBase = N0;
return true;
}
- if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
- ImmOffset)) {
+ if (SelectSMRDOffset(N, N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
+ ImmOffset, ScaleOffset)) {
SBase = N1;
return true;
}
return false;
}
-bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
+bool AMDGPUDAGToDAGISel::SelectSMRD(SDNode *N, SDValue Addr, SDValue &SBase,
SDValue *SOffset, SDValue *Offset,
- bool Imm32Only) const {
- if (SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only)) {
+ bool Imm32Only, bool *ScaleOffset) const {
+ if (SelectSMRDBaseOffset(N, Addr, SBase, SOffset, Offset, Imm32Only,
+ /* IsBuffer */ false, /* HasSOffset */ false,
+ /* ImmOffset */ 0, ScaleOffset)) {
SBase = Expand32BitAddress(SBase);
return true;
}
@@ -2323,36 +2416,51 @@ bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
SDValue &Offset) const {
- return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset);
+ return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
+ &Offset);
}
bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
SDValue &Offset) const {
assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
- return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset,
- /* Imm32Only */ true);
+ return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
+ &Offset, /* Imm32Only */ true);
}
-bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
- SDValue &SOffset) const {
- return SelectSMRD(Addr, SBase, &SOffset, /* Offset */ nullptr);
+bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDNode *N, SDValue Addr, SDValue &SBase,
+ SDValue &SOffset, SDValue &CPol) const {
+ bool ScaleOffset;
+ if (!SelectSMRD(N, Addr, SBase, &SOffset, /* Offset */ nullptr,
+ /* Imm32Only */ false, &ScaleOffset))
+ return false;
+
+ CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
+ SDLoc(N), MVT::i32);
+ return true;
}
-bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDValue Addr, SDValue &SBase,
- SDValue &SOffset,
- SDValue &Offset) const {
- return SelectSMRD(Addr, SBase, &SOffset, &Offset);
+bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDNode *N, SDValue Addr,
+ SDValue &SBase, SDValue &SOffset,
+ SDValue &Offset,
+ SDValue &CPol) const {
+ bool ScaleOffset;
+ if (!SelectSMRD(N, Addr, SBase, &SOffset, &Offset, false, &ScaleOffset))
+ return false;
+
+ CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
+ SDLoc(N), MVT::i32);
+ return true;
}
bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
- return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
+ return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset,
/* Imm32Only */ false, /* IsBuffer */ true);
}
bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
SDValue &Offset) const {
assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
- return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
+ return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset,
/* Imm32Only */ true, /* IsBuffer */ true);
}
@@ -2361,9 +2469,9 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
// Match the (soffset + offset) pair as a 32-bit register base and
// an immediate offset.
return N.getValueType() == MVT::i32 &&
- SelectSMRDBaseOffset(N, /* SBase */ SOffset, /* SOffset*/ nullptr,
- &Offset, /* Imm32Only */ false,
- /* IsBuffer */ true);
+ SelectSMRDBaseOffset(/* N */ nullptr, N, /* SBase */ SOffset,
+ /* SOffset*/ nullptr, &Offset,
+ /* Imm32Only */ false, /* IsBuffer */ true);
}
bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
@@ -3753,58 +3861,114 @@ bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
return SelectVOP3Mods(In, Src, SrcMods);
}
+// Match lowered fpext from bf16 to f32. This is a bit operation extending
+// a 16-bit value with 16 bits of zeroes at the LSB:
+//
+// 1. (f32 (bitcast (build_vector (i16 0), (i16 (bitcast bf16:val)))))
+// 2. (f32 (bitcast (and i32:val, 0xffff0000))) -> IsExtractHigh = true
+// 3. (f32 (bitcast (shl i32:val, 16))) -> IsExtractHigh = false
+static SDValue matchBF16FPExtendLike(SDValue Op, bool &IsExtractHigh) {
+ if (Op.getValueType() != MVT::f32 || Op.getOpcode() != ISD::BITCAST)
+ return SDValue();
+ Op = Op.getOperand(0);
+
+ IsExtractHigh = false;
+ if (Op.getValueType() == MVT::v2i16 && Op.getOpcode() == ISD::BUILD_VECTOR) {
+ auto Low16 = dyn_cast<ConstantSDNode>(Op.getOperand(0));
+ if (!Low16 || !Low16->isZero())
+ return SDValue();
+ Op = stripBitcast(Op.getOperand(1));
+ if (Op.getValueType() != MVT::bf16)
+ return SDValue();
+ return Op;
+ }
+
+ if (Op.getValueType() != MVT::i32)
+ return SDValue();
+
+ if (Op.getOpcode() == ISD::AND) {
+ if (auto Mask = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ if (Mask->getZExtValue() == 0xffff0000) {
+ IsExtractHigh = true;
+ return Op.getOperand(0);
+ }
+ }
+ return SDValue();
+ }
+
+ if (Op.getOpcode() == ISD::SHL) {
+ if (auto Amt = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ if (Amt->getZExtValue() == 16)
+ return Op.getOperand(0);
+ }
+ }
+
+ return SDValue();
+}
+
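All three patterns in the comment describe the same bit-level fact: fpext from bf16 to f32 places the 16 bf16 bits in the high half of the f32 and zeroes the low half. A small illustration on raw bits (purely a sketch, no LLVM types involved):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // fpext from bf16 to f32 is a 16-bit left shift of the raw bf16 bits.
    static float bf16ToF32(uint16_t Bf16Bits) {
      uint32_t F32Bits = (uint32_t)Bf16Bits << 16; // pattern 3: shl by 16
      float F;
      std::memcpy(&F, &F32Bits, sizeof(F));
      return F;
    }

    // Pattern 2: masking an i32 with 0xffff0000 keeps a bf16 value that
    // already lives in the high half ("IsExtractHigh" in the selector).
    static float bf16HighHalfToF32(uint32_t Packed) {
      uint32_t F32Bits = Packed & 0xffff0000u;
      float F;
      std::memcpy(&F, &F32Bits, sizeof(F));
      return F;
    }

    int main() {
      // 0x3f80 is 1.0 in bf16 (same sign/exponent layout as the top of f32).
      assert(bf16ToF32(0x3f80) == 1.0f);
      assert(bf16HighHalfToF32(0x3f80deadu) == 1.0f); // low garbage discarded
      return 0;
    }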
// The return value is not whether the match is possible (which it always is),
// but whether or not a conversion is really used.
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
- unsigned &Mods) const {
+ unsigned &Mods,
+ MVT VT) const {
Mods = 0;
SelectVOP3ModsImpl(In, Src, Mods);
+ bool IsExtractHigh = false;
if (Src.getOpcode() == ISD::FP_EXTEND) {
Src = Src.getOperand(0);
- assert(Src.getValueType() == MVT::f16);
- Src = stripBitcast(Src);
+ } else if (VT == MVT::bf16) {
+ SDValue B16 = matchBF16FPExtendLike(Src, IsExtractHigh);
+ if (!B16)
+ return false;
+ Src = B16;
+ } else
+ return false;
- // Be careful about folding modifiers if we already have an abs. fneg is
- // applied last, so we don't want to apply an earlier fneg.
- if ((Mods & SISrcMods::ABS) == 0) {
- unsigned ModsTmp;
- SelectVOP3ModsImpl(Src, Src, ModsTmp);
+ if (Src.getValueType() != VT &&
+ (VT != MVT::bf16 || Src.getValueType() != MVT::i32))
+ return false;
- if ((ModsTmp & SISrcMods::NEG) != 0)
- Mods ^= SISrcMods::NEG;
+ Src = stripBitcast(Src);
- if ((ModsTmp & SISrcMods::ABS) != 0)
- Mods |= SISrcMods::ABS;
- }
+ // Be careful about folding modifiers if we already have an abs. fneg is
+ // applied last, so we don't want to apply an earlier fneg.
+ if ((Mods & SISrcMods::ABS) == 0) {
+ unsigned ModsTmp;
+ SelectVOP3ModsImpl(Src, Src, ModsTmp);
- // op_sel/op_sel_hi decide the source type and source.
- // If the source's op_sel_hi is set, it indicates to do a conversion from fp16.
- // If the sources's op_sel is set, it picks the high half of the source
- // register.
+ if ((ModsTmp & SISrcMods::NEG) != 0)
+ Mods ^= SISrcMods::NEG;
- Mods |= SISrcMods::OP_SEL_1;
- if (isExtractHiElt(Src, Src)) {
- Mods |= SISrcMods::OP_SEL_0;
+ if ((ModsTmp & SISrcMods::ABS) != 0)
+ Mods |= SISrcMods::ABS;
+ }
- // TODO: Should we try to look for neg/abs here?
- }
+ // op_sel/op_sel_hi decide the source type and source.
+ // If the source's op_sel_hi is set, it indicates to do a conversion from
+  // fp16. If the source's op_sel is set, it picks the high half of the source
+ // register.
- // Prevent unnecessary subreg COPY to VGPR_16
- if (Src.getOpcode() == ISD::TRUNCATE &&
- Src.getOperand(0).getValueType() == MVT::i32) {
- Src = Src.getOperand(0);
- }
- return true;
+ Mods |= SISrcMods::OP_SEL_1;
+ if (IsExtractHigh ||
+ (Src.getValueSizeInBits() == 16 && isExtractHiElt(Src, Src))) {
+ Mods |= SISrcMods::OP_SEL_0;
+
+ // TODO: Should we try to look for neg/abs here?
}
- return false;
+ // Prevent unnecessary subreg COPY to VGPR_16
+ if (Src.getOpcode() == ISD::TRUNCATE &&
+ Src.getOperand(0).getValueType() == MVT::i32) {
+ Src = Src.getOperand(0);
+ }
+ return true;
}
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
unsigned Mods = 0;
- if (!SelectVOP3PMadMixModsImpl(In, Src, Mods))
+ if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16))
return false;
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
return true;
@@ -3813,7 +3977,24 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
unsigned Mods = 0;
- SelectVOP3PMadMixModsImpl(In, Src, Mods);
+ SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16);
+ SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+ return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16ModsExt(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+ unsigned Mods = 0;
+ if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16))
+ return false;
+ SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+ return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+ unsigned Mods = 0;
+ SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16);
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
return true;
}
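In these mad-mix selectors, op_sel_hi marks a source as a 16-bit value to be converted up to f32, and op_sel picks which half of the 32-bit register holds that value. What the half selection means for a packed register, as a sketch (the helper below is illustrative and does not use the real SISrcMods encoding):

    #include <cassert>
    #include <cstdint>

    // Pull the selected 16-bit half out of a packed 32-bit source, the way
    // op_sel chooses between the low and high halves of a converted operand.
    static uint16_t selectHalf(uint32_t Packed, bool OpSelHigh) {
      return OpSelHigh ? (uint16_t)(Packed >> 16) : (uint16_t)Packed;
    }

    int main() {
      uint32_t Packed = 0xBBBBAAAAu; // high half 0xBBBB, low half 0xAAAA
      assert(selectHalf(Packed, /*OpSelHigh=*/false) == 0xAAAA);
      assert(selectHalf(Packed, /*OpSelHigh=*/true) == 0xBBBB);
      return 0;
    }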
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index acbab3d..a6ce745 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -19,6 +19,7 @@
#include "SIModeRegisterDefaults.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
#include "llvm/Target/TargetMachine.h"
namespace llvm {
@@ -162,7 +163,8 @@ private:
bool SelectScratchOffset(SDNode *N, SDValue Addr, SDValue &VAddr,
SDValue &Offset) const;
bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
- SDValue &VOffset, SDValue &Offset) const;
+ SDValue &VOffset, SDValue &Offset, bool &ScaleOffset,
+ bool NeedIOffset = true) const;
bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &VOffset, SDValue &Offset,
SDValue &CPol) const;
@@ -174,24 +176,31 @@ private:
bool checkFlatScratchSVSSwizzleBug(SDValue VAddr, SDValue SAddr,
uint64_t ImmOffset) const;
bool SelectScratchSVAddr(SDNode *N, SDValue Addr, SDValue &VAddr,
- SDValue &SAddr, SDValue &Offset) const;
+ SDValue &SAddr, SDValue &Offset,
+ SDValue &CPol) const;
- bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue *SOffset,
+ bool SelectSMRDOffset(SDNode *N, SDValue ByteOffsetNode, SDValue *SOffset,
SDValue *Offset, bool Imm32Only = false,
bool IsBuffer = false, bool HasSOffset = false,
- int64_t ImmOffset = 0) const;
+ int64_t ImmOffset = 0,
+ bool *ScaleOffset = nullptr) const;
SDValue Expand32BitAddress(SDValue Addr) const;
- bool SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase, SDValue *SOffset,
- SDValue *Offset, bool Imm32Only = false,
- bool IsBuffer = false, bool HasSOffset = false,
- int64_t ImmOffset = 0) const;
- bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue *SOffset,
- SDValue *Offset, bool Imm32Only = false) const;
+ bool SelectSMRDBaseOffset(SDNode *N, SDValue Addr, SDValue &SBase,
+ SDValue *SOffset, SDValue *Offset,
+ bool Imm32Only = false, bool IsBuffer = false,
+ bool HasSOffset = false, int64_t ImmOffset = 0,
+ bool *ScaleOffset = nullptr) const;
+ bool SelectSMRD(SDNode *N, SDValue Addr, SDValue &SBase, SDValue *SOffset,
+ SDValue *Offset, bool Imm32Only = false,
+ bool *ScaleOffset = nullptr) const;
bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
- bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &SOffset) const;
- bool SelectSMRDSgprImm(SDValue Addr, SDValue &SBase, SDValue &SOffset,
- SDValue &Offset) const;
+ bool SelectScaleOffset(SDNode *N, SDValue &Offset, bool IsSigned) const;
+ bool SelectSMRDSgpr(SDNode *N, SDValue Addr, SDValue &SBase, SDValue &SOffset,
+ SDValue &CPol) const;
+ bool SelectSMRDSgprImm(SDNode *N, SDValue Addr, SDValue &SBase,
+ SDValue &SOffset, SDValue &Offset,
+ SDValue &CPol) const;
bool SelectSMRDBufferImm(SDValue N, SDValue &Offset) const;
bool SelectSMRDBufferImm32(SDValue N, SDValue &Offset) const;
bool SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
@@ -246,11 +255,15 @@ private:
bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
- bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
- unsigned &Mods) const;
+ bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods,
+ MVT VT) const;
bool SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
SDValue &SrcMods) const;
bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+ bool SelectVOP3PMadMixBF16ModsExt(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const;
+ bool SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const;
bool SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1, SDValue &Src2,
SDValue &Tbl) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 8975486..8ca9a97 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3494,25 +3494,74 @@ bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
}
/// Match a zero extend from a 32-bit value to 64-bits.
-static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
+Register AMDGPUInstructionSelector::matchZeroExtendFromS32(Register Reg) const {
Register ZExtSrc;
- if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
- return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
+ if (mi_match(Reg, *MRI, m_GZExt(m_Reg(ZExtSrc))))
+ return MRI->getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
// Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
- const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
+ const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
return Register();
assert(Def->getNumOperands() == 3 &&
- MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
- if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
+ MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
+ if (mi_match(Def->getOperand(2).getReg(), *MRI, m_ZeroInt())) {
return Def->getOperand(1).getReg();
}
return Register();
}
+/// Match a sign extend from a 32-bit value to 64-bits.
+Register AMDGPUInstructionSelector::matchSignExtendFromS32(Register Reg) const {
+ Register SExtSrc;
+ if (mi_match(Reg, *MRI, m_GSExt(m_Reg(SExtSrc))))
+ return MRI->getType(SExtSrc) == LLT::scalar(32) ? SExtSrc : Register();
+
+  // Match legalized form %sext = G_MERGE_VALUES (s32 %x), (G_ASHR %x, 31)
+ const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
+ if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
+ return Register();
+
+ assert(Def->getNumOperands() == 3 &&
+ MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
+ if (mi_match(Def->getOperand(2).getReg(), *MRI,
+ m_GAShr(m_SpecificReg(Def->getOperand(1).getReg()),
+ m_SpecificICst(31))))
+ return Def->getOperand(1).getReg();
+
+ if (VT->signBitIsZero(Reg))
+ return matchZeroExtendFromS32(Reg);
+
+ return Register();
+}
+
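The legalized form being matched, G_MERGE_VALUES (s32 %x), (G_ASHR %x, 31), is the standard way to build a 64-bit sign extension out of 32-bit pieces: the high word is the sign bit of %x smeared across 32 bits. A quick check of that identity on plain integers (illustrative only, assuming arithmetic right shift for signed values):

    #include <cassert>
    #include <cstdint>

    // Rebuild sext_i64(x) from the (low, high) pair the legalizer produces:
    // low = x, high = x >> 31 (arithmetic shift, i.e. 0 or -1).
    static int64_t mergeSext(int32_t X) {
      uint32_t Lo = (uint32_t)X;
      uint32_t Hi = (uint32_t)(X >> 31); // sign bit replicated across 32 bits
      return (int64_t)(((uint64_t)Hi << 32) | Lo);
    }

    int main() {
      assert(mergeSext(5) == 5);
      assert(mergeSext(-7) == -7);
      assert(mergeSext(INT32_MIN) == (int64_t)INT32_MIN);
      return 0;
    }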
+/// Match a zero extend from a 32-bit value to 64-bits, or \p Reg itself if it
+/// is 32-bit.
+Register
+AMDGPUInstructionSelector::matchZeroExtendFromS32OrS32(Register Reg) const {
+ return MRI->getType(Reg) == LLT::scalar(32) ? Reg
+ : matchZeroExtendFromS32(Reg);
+}
+
+/// Match a sign extend from a 32-bit value to 64-bits, or \p Reg itself if it
+/// is 32-bit.
+Register
+AMDGPUInstructionSelector::matchSignExtendFromS32OrS32(Register Reg) const {
+ return MRI->getType(Reg) == LLT::scalar(32) ? Reg
+ : matchSignExtendFromS32(Reg);
+}
+
+Register
+AMDGPUInstructionSelector::matchExtendFromS32OrS32(Register Reg,
+ bool IsSigned) const {
+ if (IsSigned)
+ return matchSignExtendFromS32OrS32(Reg);
+
+ return matchZeroExtendFromS32OrS32(Reg);
+}
+
Register AMDGPUInstructionSelector::matchAnyExtendFromS32(Register Reg) const {
Register AnyExtSrc;
if (mi_match(Reg, *MRI, m_GAnyExt(m_Reg(AnyExtSrc))))
@@ -3581,7 +3630,7 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
if (isSGPR(SAddr)) {
Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
- if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
+ if (Register Off = matchZeroExtendFromS32(PtrBaseOffset)) {
Addr = SAddr;
VOffset = Off;
}
@@ -5223,7 +5272,7 @@ AMDGPUInstructionSelector::selectSWMMACIndex32(MachineOperand &Root) const {
getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
unsigned Key = 0;
- Register S32 = matchZeroExtendFromS32(*MRI, Src);
+ Register S32 = matchZeroExtendFromS32(Src);
if (!S32)
S32 = matchAnyExtendFromS32(Src);
@@ -5296,10 +5345,68 @@ AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
}};
}
+// Given \p Offset and the load specified by the \p Root operand, check whether
+// \p Offset is a multiple of the load byte size. If it is, update \p Offset to
+// a pre-scaled value and return true.
+bool AMDGPUInstructionSelector::selectScaleOffset(MachineOperand &Root,
+ Register &Offset,
+ bool IsSigned) const {
+ if (!Subtarget->hasScaleOffset())
+ return false;
+
+ const MachineInstr &MI = *Root.getParent();
+ MachineMemOperand *MMO = *MI.memoperands_begin();
+
+ if (!MMO->getSize().hasValue())
+ return false;
+
+ uint64_t Size = MMO->getSize().getValue();
+
+ Register OffsetReg = matchExtendFromS32OrS32(Offset, IsSigned);
+ if (!OffsetReg)
+ OffsetReg = Offset;
+
+ if (auto Def = getDefSrcRegIgnoringCopies(OffsetReg, *MRI))
+ OffsetReg = Def->Reg;
+
+ Register Op0;
+ MachineInstr *Mul;
+ bool ScaleOffset =
+ (isPowerOf2_64(Size) &&
+ mi_match(OffsetReg, *MRI,
+ m_GShl(m_Reg(Op0),
+ m_any_of(m_SpecificICst(Log2_64(Size)),
+ m_Copy(m_SpecificICst(Log2_64(Size))))))) ||
+ mi_match(OffsetReg, *MRI,
+ m_GMul(m_Reg(Op0), m_any_of(m_SpecificICst(Size),
+ m_Copy(m_SpecificICst(Size))))) ||
+ mi_match(
+ OffsetReg, *MRI,
+ m_BinOp(IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO : AMDGPU::S_MUL_U64,
+ m_Reg(Op0), m_SpecificICst(Size))) ||
+ // Match G_AMDGPU_MAD_U64_U32 offset, c, 0
+ (mi_match(OffsetReg, *MRI, m_MInstr(Mul)) &&
+ (Mul->getOpcode() == (IsSigned ? AMDGPU::G_AMDGPU_MAD_I64_I32
+ : AMDGPU::G_AMDGPU_MAD_U64_U32) ||
+ (IsSigned && Mul->getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32 &&
+ VT->signBitIsZero(Mul->getOperand(2).getReg()))) &&
+ mi_match(Mul->getOperand(4).getReg(), *MRI, m_ZeroInt()) &&
+ mi_match(Mul->getOperand(3).getReg(), *MRI,
+ m_GTrunc(m_any_of(m_SpecificICst(Size),
+ m_Copy(m_SpecificICst(Size))))) &&
+ mi_match(Mul->getOperand(2).getReg(), *MRI, m_Reg(Op0)));
+
+ if (ScaleOffset)
+ Offset = Op0;
+
+ return ScaleOffset;
+}
+
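The GlobalISel variant also accepts G_AMDGPU_MAD_U64_U32 (or the signed form) with a zero addend, which is just the multiply form in disguise. The identity it relies on, as a trivial sketch:

    #include <cassert>
    #include <cstdint>

    // mad(a, b, 0) == a * b, so a MAD with a zero addend scales the index
    // exactly like the plain G_MUL form matched above.
    static uint64_t mad(uint32_t A, uint32_t B, uint64_t C) {
      return (uint64_t)A * B + C;
    }

    int main() {
      assert(mad(9, 8, 0) == 9u * 8u);
      return 0;
    }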
bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
Register &Base,
Register *SOffset,
- int64_t *Offset) const {
+ int64_t *Offset,
+ bool *ScaleOffset) const {
MachineInstr *MI = Root.getParent();
MachineBasicBlock *MBB = MI->getParent();
@@ -5314,6 +5421,9 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
const GEPInfo &GEPI = AddrInfo[0];
std::optional<int64_t> EncodedImm;
+ if (ScaleOffset)
+ *ScaleOffset = false;
+
if (SOffset && Offset) {
EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
/*HasSOffset=*/true);
@@ -5321,8 +5431,12 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
AddrInfo.size() > 1) {
const GEPInfo &GEPI2 = AddrInfo[1];
if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
- if (Register OffsetReg =
- matchZeroExtendFromS32(*MRI, GEPI2.SgprParts[1])) {
+ Register OffsetReg = GEPI2.SgprParts[1];
+ if (ScaleOffset)
+ *ScaleOffset =
+ selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
+ OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
+ if (OffsetReg) {
Base = GEPI2.SgprParts[0];
*SOffset = OffsetReg;
*Offset = *EncodedImm;
@@ -5367,7 +5481,11 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
}
if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
- if (Register OffsetReg = matchZeroExtendFromS32(*MRI, GEPI.SgprParts[1])) {
+ Register OffsetReg = GEPI.SgprParts[1];
+ if (ScaleOffset)
+ *ScaleOffset = selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
+ OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
+ if (OffsetReg) {
Base = GEPI.SgprParts[0];
*SOffset = OffsetReg;
return true;
@@ -5381,7 +5499,8 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
Register Base;
int64_t Offset;
- if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset))
+ if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset,
+ /* ScaleOffset */ nullptr))
return std::nullopt;
return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
@@ -5412,23 +5531,30 @@ AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
Register Base, SOffset;
- if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr))
+ bool ScaleOffset;
+ if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr,
+ &ScaleOffset))
return std::nullopt;
+ unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
- [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
}
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
Register Base, SOffset;
int64_t Offset;
- if (!selectSmrdOffset(Root, Base, &SOffset, &Offset))
+ bool ScaleOffset;
+ if (!selectSmrdOffset(Root, Base, &SOffset, &Offset, &ScaleOffset))
return std::nullopt;
+ unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
}
std::pair<Register, int>
@@ -5490,7 +5616,8 @@ AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
- unsigned CPolBits) const {
+ unsigned CPolBits,
+ bool NeedIOffset) const {
Register Addr = Root.getReg();
Register PtrBase;
int64_t ConstOffset;
@@ -5501,7 +5628,8 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
if (ConstOffset != 0) {
- if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
+ if (NeedIOffset &&
+ TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
SIInstrFlags::FlatGlobal)) {
Addr = PtrBase;
ImmOffset = ConstOffset;
@@ -5514,11 +5642,15 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
// saddr + large_offset -> saddr +
// (voffset = large_offset & ~MaxOffset) +
// (large_offset & MaxOffset);
- int64_t SplitImmOffset, RemainderOffset;
- std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset(
- ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
+ int64_t SplitImmOffset = 0, RemainderOffset = ConstOffset;
+ if (NeedIOffset) {
+ std::tie(SplitImmOffset, RemainderOffset) =
+ TII.splitFlatOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
+ SIInstrFlags::FlatGlobal);
+ }
- if (isUInt<32>(RemainderOffset)) {
+ if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset)
+ : isUInt<32>(RemainderOffset)) {
MachineInstr *MI = Root.getParent();
MachineBasicBlock *MBB = MI->getParent();
Register HighBits =
@@ -5528,12 +5660,22 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
HighBits)
.addImm(RemainderOffset);
+ if (NeedIOffset)
+ return {{
+ [=](MachineInstrBuilder &MIB) {
+ MIB.addReg(PtrBase);
+ }, // saddr
+ [=](MachineInstrBuilder &MIB) {
+ MIB.addReg(HighBits);
+ }, // voffset
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
+ }};
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
[=](MachineInstrBuilder &MIB) {
MIB.addReg(HighBits);
}, // voffset
- [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
[=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
}};
}
@@ -5565,18 +5707,33 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
// It's possible voffset is an SGPR here, but the copy to VGPR will be
// inserted later.
- if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
+ bool ScaleOffset = selectScaleOffset(Root, PtrBaseOffset,
+ Subtarget->hasSignedGVSOffset());
+ if (Register VOffset = matchExtendFromS32OrS32(
+ PtrBaseOffset, Subtarget->hasSignedGVSOffset())) {
+ if (NeedIOffset)
+ return {{[=](MachineInstrBuilder &MIB) { // saddr
+ MIB.addReg(SAddr);
+ },
+ [=](MachineInstrBuilder &MIB) { // voffset
+ MIB.addReg(VOffset);
+ },
+ [=](MachineInstrBuilder &MIB) { // offset
+ MIB.addImm(ImmOffset);
+ },
+ [=](MachineInstrBuilder &MIB) { // cpol
+ MIB.addImm(CPolBits |
+ (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
+ }}};
return {{[=](MachineInstrBuilder &MIB) { // saddr
MIB.addReg(SAddr);
},
[=](MachineInstrBuilder &MIB) { // voffset
MIB.addReg(VOffset);
},
- [=](MachineInstrBuilder &MIB) { // offset
- MIB.addImm(ImmOffset);
- },
[=](MachineInstrBuilder &MIB) { // cpol
- MIB.addImm(CPolBits);
+ MIB.addImm(CPolBits |
+ (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
}}};
}
}
@@ -5597,10 +5754,16 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
.addImm(0);
+ if (NeedIOffset)
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
+ }};
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
[=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
- [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
[=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
}};
}
@@ -5732,22 +5895,32 @@ AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
return std::nullopt;
+ unsigned CPol = selectScaleOffset(Root, RHS, true /* IsSigned */)
+ ? AMDGPU::CPol::SCAL
+ : 0;
+
if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
int FI = LHSDef->MI->getOperand(1).getIndex();
return {{
- [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
[=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
- [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
}};
}
if (!isSGPR(LHS))
+ if (auto Def = getDefSrcRegIgnoringCopies(LHS, *MRI))
+ LHS = Def->Reg;
+
+ if (!isSGPR(LHS))
return std::nullopt;
return {{
- [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
- [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
- [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
}};
}
@@ -6895,6 +7068,17 @@ void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
}
+void AMDGPUInstructionSelector::renderPrefetchLoc(MachineInstrBuilder &MIB,
+ const MachineInstr &MI,
+ int OpIdx) const {
+ uint32_t V = MI.getOperand(2).getImm();
+ V = (AMDGPU::CPol::SCOPE_MASK - (V & AMDGPU::CPol::SCOPE_MASK))
+ << AMDGPU::CPol::SCOPE_SHIFT;
+ if (!Subtarget->hasSafeCUPrefetch())
+ V = std::max(V, (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe
+ MIB.addImm(V);
+}
+
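renderPrefetchLoc inverts the prefetch intrinsic's locality hint (operand 2, 0-3, where 3 means keep the data closest) into a cache scope, then widens the scope to at least SE when CU-scope prefetch is unsafe. A sketch of that arithmetic with stand-in constants (the real SCOPE_* encodings live in SIDefines.h; the numeric values below are assumptions for illustration only):

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    // Stand-in cpol scope encoding (assumed layout, illustration only).
    constexpr uint32_t kScopeShift = 3;
    constexpr uint32_t kScopeMask = 0x3;             // unshifted 2-bit field
    constexpr uint32_t kScopeSE = 1u << kScopeShift; // next scope up from CU

    // Locality 3 (keep closest) -> scope 0 (CU); locality 0 -> widest scope.
    static uint32_t localityToScope(uint32_t Locality, bool SafeCUPrefetch) {
      uint32_t V = (kScopeMask - (Locality & kScopeMask)) << kScopeShift;
      if (!SafeCUPrefetch)
        V = std::max(V, kScopeSE); // CU scope is unsafe, widen to SE
      return V;
    }

    int main() {
      assert(localityToScope(3, /*SafeCUPrefetch=*/true) == 0);
      assert(localityToScope(0, /*SafeCUPrefetch=*/true) == (3u << kScopeShift));
      assert(localityToScope(3, /*SafeCUPrefetch=*/false) == kScopeSE);
      return 0;
    }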
/// Convert from 2-bit value to enum values used for op_sel* source modifiers.
void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 34bdf0a..61d9de1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -232,8 +232,10 @@ private:
InstructionSelector::ComplexRendererFns
selectVINTERPModsHi(MachineOperand &Root) const;
+ bool selectScaleOffset(MachineOperand &Root, Register &Offset,
+ bool IsSigned) const;
bool selectSmrdOffset(MachineOperand &Root, Register &Base, Register *SOffset,
- int64_t *Offset) const;
+ int64_t *Offset, bool *ScaleOffset) const;
InstructionSelector::ComplexRendererFns
selectSmrdImm(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
@@ -254,7 +256,8 @@ private:
selectScratchOffset(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
- selectGlobalSAddr(MachineOperand &Root, unsigned CPolBits) const;
+ selectGlobalSAddr(MachineOperand &Root, unsigned CPolBits,
+ bool NeedIOffset = true) const;
InstructionSelector::ComplexRendererFns
selectGlobalSAddr(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
@@ -411,6 +414,10 @@ private:
void renderRoundMode(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx) const;
+
+ void renderPrefetchLoc(MachineInstrBuilder &MIB, const MachineInstr &MI,
+ int OpIdx) const;
+
void renderScaledMAIIntrinsicOperand(MachineInstrBuilder &MIB,
const MachineInstr &MI, int OpIdx) const;
@@ -421,6 +428,19 @@ private:
// shift amount operand's `ShAmtBits` bits is unneeded.
bool isUnneededShiftMask(const MachineInstr &MI, unsigned ShAmtBits) const;
+ /// Match a zero extend from a 32-bit value to 64-bits.
+ Register matchZeroExtendFromS32(Register Reg) const;
+ /// Match a sign extend from a 32-bit value to 64-bits.
+ Register matchSignExtendFromS32(Register Reg) const;
+ /// Match a zero extend from a 32-bit value to 64-bits, or \p Reg itself if it
+ /// is 32-bit.
+ Register matchZeroExtendFromS32OrS32(Register Reg) const;
+ /// Match a sign extend from a 32-bit value to 64-bits, or \p Reg itself if it
+ /// is 32-bit.
+ Register matchSignExtendFromS32OrS32(Register Reg) const;
+ /// Match either sign or zero extend depending on the \p IsSigned from a
+ /// 32-bit value to 64-bits, or \p Reg itself if it is 32-bit.
+ Register matchExtendFromS32OrS32(Register Reg, bool IsSigned) const;
/// Match an any extend from a 32-bit value to 64-bit.
Register matchAnyExtendFromS32(Register Reg) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index e7bf88d..fedfa3f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4208,6 +4208,9 @@ bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
assert(Ty.isScalar());
unsigned Size = Ty.getSizeInBits();
+ if (ST.hasVectorMulU64() && Size == 64)
+ return true;
+
unsigned NumParts = Size / 32;
assert((Size % 32) == 0);
assert(NumParts >= 2);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
index ba66134..e187959 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
@@ -23,6 +23,8 @@
#include "GCNSubtarget.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
@@ -115,126 +117,233 @@ public:
VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {};
- bool isLaneMask(Register Reg) {
- const RegisterBank *RB = MRI.getRegBankOrNull(Reg);
- if (RB && RB->getID() == AMDGPU::VCCRegBankID)
- return true;
+ bool isLaneMask(Register Reg);
+ std::pair<MachineInstr *, Register> tryMatch(Register Src, unsigned Opcode);
+ std::pair<GUnmerge *, int> tryMatchRALFromUnmerge(Register Src);
+ Register getReadAnyLaneSrc(Register Src);
+ void replaceRegWithOrBuildCopy(Register Dst, Register Src);
- const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
- return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1);
- }
+ bool tryEliminateReadAnyLane(MachineInstr &Copy);
+ void tryCombineCopy(MachineInstr &MI);
+ void tryCombineS1AnyExt(MachineInstr &MI);
+};
- void cleanUpAfterCombine(MachineInstr &MI, MachineInstr *Optional0) {
- MI.eraseFromParent();
- if (Optional0 && isTriviallyDead(*Optional0, MRI))
- Optional0->eraseFromParent();
- }
+bool AMDGPURegBankLegalizeCombiner::isLaneMask(Register Reg) {
+ const RegisterBank *RB = MRI.getRegBankOrNull(Reg);
+ if (RB && RB->getID() == AMDGPU::VCCRegBankID)
+ return true;
- std::pair<MachineInstr *, Register> tryMatch(Register Src, unsigned Opcode) {
- MachineInstr *MatchMI = MRI.getVRegDef(Src);
- if (MatchMI->getOpcode() != Opcode)
- return {nullptr, Register()};
- return {MatchMI, MatchMI->getOperand(1).getReg()};
- }
+ const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
+ return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1);
+}
- void tryCombineCopy(MachineInstr &MI) {
- Register Dst = MI.getOperand(0).getReg();
- Register Src = MI.getOperand(1).getReg();
- // Skip copies of physical registers.
- if (!Dst.isVirtual() || !Src.isVirtual())
- return;
-
- // This is a cross bank copy, sgpr S1 to lane mask.
- //
- // %Src:sgpr(s1) = G_TRUNC %TruncS32Src:sgpr(s32)
- // %Dst:lane-mask(s1) = COPY %Src:sgpr(s1)
- // ->
- // %Dst:lane-mask(s1) = G_AMDGPU_COPY_VCC_SCC %TruncS32Src:sgpr(s32)
- if (isLaneMask(Dst) && MRI.getRegBankOrNull(Src) == SgprRB) {
- auto [Trunc, TruncS32Src] = tryMatch(Src, AMDGPU::G_TRUNC);
- assert(Trunc && MRI.getType(TruncS32Src) == S32 &&
- "sgpr S1 must be result of G_TRUNC of sgpr S32");
-
- B.setInstr(MI);
- // Ensure that truncated bits in BoolSrc are 0.
- auto One = B.buildConstant({SgprRB, S32}, 1);
- auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One);
- B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {Dst}, {BoolSrc});
- cleanUpAfterCombine(MI, Trunc);
- return;
- }
+std::pair<MachineInstr *, Register>
+AMDGPURegBankLegalizeCombiner::tryMatch(Register Src, unsigned Opcode) {
+ MachineInstr *MatchMI = MRI.getVRegDef(Src);
+ if (MatchMI->getOpcode() != Opcode)
+ return {nullptr, Register()};
+ return {MatchMI, MatchMI->getOperand(1).getReg()};
+}
+
+std::pair<GUnmerge *, int>
+AMDGPURegBankLegalizeCombiner::tryMatchRALFromUnmerge(Register Src) {
+ MachineInstr *ReadAnyLane = MRI.getVRegDef(Src);
+ if (ReadAnyLane->getOpcode() != AMDGPU::G_AMDGPU_READANYLANE)
+ return {nullptr, -1};
+
+ Register RALSrc = ReadAnyLane->getOperand(1).getReg();
+ if (auto *UnMerge = getOpcodeDef<GUnmerge>(RALSrc, MRI))
+ return {UnMerge, UnMerge->findRegisterDefOperandIdx(RALSrc, nullptr)};
- // Src = G_AMDGPU_READANYLANE RALSrc
- // Dst = COPY Src
- // ->
- // Dst = RALSrc
- if (MRI.getRegBankOrNull(Dst) == VgprRB &&
- MRI.getRegBankOrNull(Src) == SgprRB) {
- auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE);
- if (!RAL)
- return;
-
- assert(MRI.getRegBank(RALSrc) == VgprRB);
- MRI.replaceRegWith(Dst, RALSrc);
- cleanUpAfterCombine(MI, RAL);
- return;
+ return {nullptr, -1};
+}
+
+Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) {
+ // Src = G_AMDGPU_READANYLANE RALSrc
+ auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE);
+ if (RAL)
+ return RALSrc;
+
+ // LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc
+ // LoSgpr = G_AMDGPU_READANYLANE LoVgpr
+ // HiSgpr = G_AMDGPU_READANYLANE HiVgpr
+  // Src = G_MERGE_VALUES LoSgpr, HiSgpr
+ auto *Merge = getOpcodeDef<GMergeLikeInstr>(Src, MRI);
+ if (Merge) {
+ unsigned NumElts = Merge->getNumSources();
+ auto [Unmerge, Idx] = tryMatchRALFromUnmerge(Merge->getSourceReg(0));
+ if (!Unmerge || Unmerge->getNumDefs() != NumElts || Idx != 0)
+ return {};
+
+ // Check if all elements are from same unmerge and there is no shuffling.
+ for (unsigned i = 1; i < NumElts; ++i) {
+ auto [UnmergeI, IdxI] = tryMatchRALFromUnmerge(Merge->getSourceReg(i));
+ if (UnmergeI != Unmerge || (unsigned)IdxI != i)
+ return {};
}
+ return Unmerge->getSourceReg();
}
- void tryCombineS1AnyExt(MachineInstr &MI) {
- // %Src:sgpr(S1) = G_TRUNC %TruncSrc
- // %Dst = G_ANYEXT %Src:sgpr(S1)
- // ->
- // %Dst = G_... %TruncSrc
- Register Dst = MI.getOperand(0).getReg();
- Register Src = MI.getOperand(1).getReg();
- if (MRI.getType(Src) != S1)
- return;
-
- auto [Trunc, TruncSrc] = tryMatch(Src, AMDGPU::G_TRUNC);
- if (!Trunc)
- return;
-
- LLT DstTy = MRI.getType(Dst);
- LLT TruncSrcTy = MRI.getType(TruncSrc);
-
- if (DstTy == TruncSrcTy) {
- MRI.replaceRegWith(Dst, TruncSrc);
- cleanUpAfterCombine(MI, Trunc);
- return;
- }
+ // SrcRegIdx = G_AMDGPU_READANYLANE RALElSrc
+  // SourceReg = G_MERGE_VALUES ..., SrcRegIdx, ...
+ // ..., Src, ... = G_UNMERGE_VALUES SourceReg
+ auto *UnMerge = getOpcodeDef<GUnmerge>(Src, MRI);
+ if (!UnMerge)
+ return {};
+
+ int Idx = UnMerge->findRegisterDefOperandIdx(Src, nullptr);
+ Merge = getOpcodeDef<GMergeLikeInstr>(UnMerge->getSourceReg(), MRI);
+ if (!Merge || UnMerge->getNumDefs() != Merge->getNumSources())
+ return {};
+
+ Register SrcRegIdx = Merge->getSourceReg(Idx);
+ if (MRI.getType(Src) != MRI.getType(SrcRegIdx))
+ return {};
+
+ auto [RALEl, RALElSrc] = tryMatch(SrcRegIdx, AMDGPU::G_AMDGPU_READANYLANE);
+ if (RALEl)
+ return RALElSrc;
+
+ return {};
+}
+
+void AMDGPURegBankLegalizeCombiner::replaceRegWithOrBuildCopy(Register Dst,
+ Register Src) {
+ if (Dst.isVirtual())
+ MRI.replaceRegWith(Dst, Src);
+ else
+ B.buildCopy(Dst, Src);
+}
+
+bool AMDGPURegBankLegalizeCombiner::tryEliminateReadAnyLane(
+ MachineInstr &Copy) {
+ Register Dst = Copy.getOperand(0).getReg();
+ Register Src = Copy.getOperand(1).getReg();
+
+ // Skip non-vgpr Dst
+ if (Dst.isVirtual() ? (MRI.getRegBankOrNull(Dst) != VgprRB)
+ : !TRI.isVGPR(MRI, Dst))
+ return false;
+
+ // Skip physical source registers and source registers with register class
+ if (!Src.isVirtual() || MRI.getRegClassOrNull(Src))
+ return false;
+
+ Register RALDst = Src;
+ MachineInstr &SrcMI = *MRI.getVRegDef(Src);
+ if (SrcMI.getOpcode() == AMDGPU::G_BITCAST)
+ RALDst = SrcMI.getOperand(1).getReg();
+
+ Register RALSrc = getReadAnyLaneSrc(RALDst);
+ if (!RALSrc)
+ return false;
+
+ B.setInstr(Copy);
+ if (SrcMI.getOpcode() != AMDGPU::G_BITCAST) {
+ // Src = READANYLANE RALSrc Src = READANYLANE RALSrc
+ // Dst = Copy Src $Dst = Copy Src
+ // -> ->
+ // Dst = RALSrc $Dst = Copy RALSrc
+ replaceRegWithOrBuildCopy(Dst, RALSrc);
+ } else {
+ // RALDst = READANYLANE RALSrc RALDst = READANYLANE RALSrc
+ // Src = G_BITCAST RALDst Src = G_BITCAST RALDst
+ // Dst = Copy Src Dst = Copy Src
+ // -> ->
+ // NewVgpr = G_BITCAST RALDst NewVgpr = G_BITCAST RALDst
+ // Dst = NewVgpr $Dst = Copy NewVgpr
+ auto Bitcast = B.buildBitcast({VgprRB, MRI.getType(Src)}, RALSrc);
+ replaceRegWithOrBuildCopy(Dst, Bitcast.getReg(0));
+ }
+
+ eraseInstr(Copy, MRI);
+ return true;
+}
+
+void AMDGPURegBankLegalizeCombiner::tryCombineCopy(MachineInstr &MI) {
+ if (tryEliminateReadAnyLane(MI))
+ return;
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ // Skip copies of physical registers.
+ if (!Dst.isVirtual() || !Src.isVirtual())
+ return;
+
+ // This is a cross bank copy, sgpr S1 to lane mask.
+ //
+ // %Src:sgpr(s1) = G_TRUNC %TruncS32Src:sgpr(s32)
+ // %Dst:lane-mask(s1) = COPY %Src:sgpr(s1)
+ // ->
+ // %BoolSrc:sgpr(s32) = G_AND %TruncS32Src:sgpr(s32), 1
+ // %Dst:lane-mask(s1) = G_AMDGPU_COPY_VCC_SCC %BoolSrc:sgpr(s32)
+ if (isLaneMask(Dst) && MRI.getRegBankOrNull(Src) == SgprRB) {
+ auto [Trunc, TruncS32Src] = tryMatch(Src, AMDGPU::G_TRUNC);
+ assert(Trunc && MRI.getType(TruncS32Src) == S32 &&
+ "sgpr S1 must be result of G_TRUNC of sgpr S32");
B.setInstr(MI);
+ // Ensure that truncated bits in BoolSrc are 0.
+ auto One = B.buildConstant({SgprRB, S32}, 1);
+ auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One);
+ B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {Dst}, {BoolSrc});
+ eraseInstr(MI, MRI);
+ }
+}
- if (DstTy == S32 && TruncSrcTy == S64) {
- auto Unmerge = B.buildUnmerge({SgprRB, S32}, TruncSrc);
- MRI.replaceRegWith(Dst, Unmerge.getReg(0));
- cleanUpAfterCombine(MI, Trunc);
- return;
- }
+void AMDGPURegBankLegalizeCombiner::tryCombineS1AnyExt(MachineInstr &MI) {
+ // %Src:sgpr(S1) = G_TRUNC %TruncSrc
+ // %Dst = G_ANYEXT %Src:sgpr(S1)
+ // ->
+ // %Dst = G_... %TruncSrc
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ if (MRI.getType(Src) != S1)
+ return;
+
+ auto [Trunc, TruncSrc] = tryMatch(Src, AMDGPU::G_TRUNC);
+ if (!Trunc)
+ return;
+
+ LLT DstTy = MRI.getType(Dst);
+ LLT TruncSrcTy = MRI.getType(TruncSrc);
+
+ if (DstTy == TruncSrcTy) {
+ MRI.replaceRegWith(Dst, TruncSrc);
+ eraseInstr(MI, MRI);
+ return;
+ }
- if (DstTy == S64 && TruncSrcTy == S32) {
- B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
- {TruncSrc, B.buildUndef({SgprRB, S32})});
- cleanUpAfterCombine(MI, Trunc);
- return;
- }
+ B.setInstr(MI);
- if (DstTy == S32 && TruncSrcTy == S16) {
- B.buildAnyExt(Dst, TruncSrc);
- cleanUpAfterCombine(MI, Trunc);
- return;
- }
+ if (DstTy == S32 && TruncSrcTy == S64) {
+ auto Unmerge = B.buildUnmerge({SgprRB, S32}, TruncSrc);
+ MRI.replaceRegWith(Dst, Unmerge.getReg(0));
+ eraseInstr(MI, MRI);
+ return;
+ }
- if (DstTy == S16 && TruncSrcTy == S32) {
- B.buildTrunc(Dst, TruncSrc);
- cleanUpAfterCombine(MI, Trunc);
- return;
- }
+ if (DstTy == S64 && TruncSrcTy == S32) {
+ B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
+ {TruncSrc, B.buildUndef({SgprRB, S32})});
+ eraseInstr(MI, MRI);
+ return;
+ }
- llvm_unreachable("missing anyext + trunc combine");
+ if (DstTy == S32 && TruncSrcTy == S16) {
+ B.buildAnyExt(Dst, TruncSrc);
+ eraseInstr(MI, MRI);
+ return;
}
-};
+
+ if (DstTy == S16 && TruncSrcTy == S32) {
+ B.buildTrunc(Dst, TruncSrc);
+ eraseInstr(MI, MRI);
+ return;
+ }
+
+ llvm_unreachable("missing anyext + trunc combine");
+}
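All of the anyext-of-trunc cases above rest on the same observation: only the low S1 bit of the result is defined, and each width change used (take the low half, merge with undef, anyext, trunc) preserves bit 0 of %TruncSrc. A trivial check of that on plain integers (illustrative only):

    #include <cassert>
    #include <cstdint>

    // Only bit 0 of the anyext result is meaningful, so any width change that
    // preserves the low bit is a valid replacement for anyext(trunc_s1(x)).
    static bool lowBit(uint64_t V) { return V & 1; }

    int main() {
      uint64_t Src64 = 0xdeadbeefcafef00dULL;
      uint32_t Low32 = (uint32_t)Src64;  // S64 -> S32: unmerge, take low half
      uint64_t Back64 = (uint64_t)Low32; // S32 -> S64: merge with undef high
      uint16_t Low16 = (uint16_t)Low32;  // S32 -> S16: trunc
      uint32_t Ext32 = (uint32_t)Low16;  // S16 -> S32: anyext

      assert(lowBit(Low32) == lowBit(Src64));
      assert(lowBit(Back64) == lowBit(Src64));
      assert(lowBit(Low16) == lowBit(Src64));
      assert(lowBit(Ext32) == lowBit(Src64));
      return 0;
    }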
// Search through MRI for virtual registers with sgpr register bank and S1 LLT.
[[maybe_unused]] static Register getAnySgprS1(const MachineRegisterInfo &MRI) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index 411159c..f471881 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -33,7 +33,7 @@ RegBankLegalizeHelper::RegBankLegalizeHelper(
MachineIRBuilder &B, const MachineUniformityInfo &MUI,
const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
: ST(B.getMF().getSubtarget<GCNSubtarget>()), B(B), MRI(*B.getMRI()),
- MUI(MUI), RBI(RBI), RBLRules(RBLRules),
+ MUI(MUI), RBI(RBI), RBLRules(RBLRules), IsWave32(ST.isWave32()),
SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}
@@ -56,6 +56,224 @@ void RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) {
lower(MI, Mapping, WaterfallSgprs);
}
+bool RegBankLegalizeHelper::executeInWaterfallLoop(
+ MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range,
+ SmallSet<Register, 4> &SGPROperandRegs) {
+ // Track use registers which have already been expanded with a readfirstlane
+ // sequence. This may have multiple uses if moving a sequence.
+ DenseMap<Register, Register> WaterfalledRegMap;
+
+ MachineBasicBlock &MBB = B.getMBB();
+ MachineFunction &MF = B.getMF();
+
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
+ unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc, ExecReg;
+ if (IsWave32) {
+ MovExecOpc = AMDGPU::S_MOV_B32;
+ MovExecTermOpc = AMDGPU::S_MOV_B32_term;
+ XorTermOpc = AMDGPU::S_XOR_B32_term;
+ AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
+ ExecReg = AMDGPU::EXEC_LO;
+ } else {
+ MovExecOpc = AMDGPU::S_MOV_B64;
+ MovExecTermOpc = AMDGPU::S_MOV_B64_term;
+ XorTermOpc = AMDGPU::S_XOR_B64_term;
+ AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
+ ExecReg = AMDGPU::EXEC;
+ }
+
+#ifndef NDEBUG
+ const int OrigRangeSize = std::distance(Range.begin(), Range.end());
+#endif
+
+ MachineRegisterInfo &MRI = *B.getMRI();
+ Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
+ Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
+
+ // Don't bother using generic instructions/registers for the exec mask.
+ B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);
+
+ Register SavedExec = MRI.createVirtualRegister(WaveRC);
+
+ // To insert the loop we need to split the block. Move everything before
+ // this point to a new block, and insert a new empty block before this
+ // instruction.
+ MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
+ MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
+ MachineBasicBlock *RestoreExecBB = MF.CreateMachineBasicBlock();
+ MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
+ MachineFunction::iterator MBBI(MBB);
+ ++MBBI;
+ MF.insert(MBBI, LoopBB);
+ MF.insert(MBBI, BodyBB);
+ MF.insert(MBBI, RestoreExecBB);
+ MF.insert(MBBI, RemainderBB);
+
+ LoopBB->addSuccessor(BodyBB);
+ BodyBB->addSuccessor(RestoreExecBB);
+ BodyBB->addSuccessor(LoopBB);
+
+ // Move the rest of the block into a new block.
+ RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
+ RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
+
+ MBB.addSuccessor(LoopBB);
+ RestoreExecBB->addSuccessor(RemainderBB);
+
+ B.setInsertPt(*LoopBB, LoopBB->end());
+
+ // +-MBB:------------+
+ // | ... |
+ // | %0 = G_INST_1 |
+ // | %Dst = MI %Vgpr |
+ // | %1 = G_INST_2 |
+ // | ... |
+ // +-----------------+
+ // ->
+ // +-MBB-------------------------------+
+ // | ... |
+ // | %0 = G_INST_1 |
+ // | %SaveExecReg = S_MOV_B32 $exec_lo |
+ // +----------------|------------------+
+ // | /------------------------------|
+ // V V |
+ // +-LoopBB---------------------------------------------------------------+ |
+ // | %CurrentLaneReg:sgpr(s32) = READFIRSTLANE %Vgpr | |
+ // | instead of executing for each lane, see if other lanes had | |
+ // | same value for %Vgpr and execute for them also. | |
+ // | %CondReg:vcc(s1) = G_ICMP eq %CurrentLaneReg, %Vgpr | |
+ // | %CondRegLM:sreg_32 = ballot %CondReg // copy vcc to sreg32 lane mask | |
+ // | %SavedExec = S_AND_SAVEEXEC_B32 %CondRegLM | |
+ // | exec is active for lanes with the same "CurrentLane value" in Vgpr | |
+ // +----------------|-----------------------------------------------------+ |
+ // V |
+ // +-BodyBB------------------------------------------------------------+ |
+ // | %Dst = MI %CurrentLaneReg:sgpr(s32) | |
+ // | executed only for active lanes and written to Dst | |
+ // | $exec = S_XOR_B32 $exec, %SavedExec | |
+ // | set active lanes to 0 in SavedExec, lanes that did not write to | |
+ // | Dst yet, and set this as new exec (for READFIRSTLANE and ICMP) | |
+ // | SI_WATERFALL_LOOP LoopBB |-----|
+ // +----------------|--------------------------------------------------+
+ // V
+ // +-RestoreExecBB--------------------------+
+ // | $exec_lo = S_MOV_B32_term %SaveExecReg |
+ // +----------------|-----------------------+
+ // V
+ // +-RemainderBB:----------------------+
+ // | %1 = G_INST_2 |
+ // | ... |
+ // +---------------------------------- +
+
+ // Move the instruction into the loop body. Note we moved everything after
+ // Range.end() already into a new block, so Range.end() is no longer valid.
+ BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());
+
+ // Figure out the iterator range after splicing the instructions.
+ MachineBasicBlock::iterator NewBegin = Range.begin()->getIterator();
+ auto NewEnd = BodyBB->end();
+ assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
+
+ B.setMBB(*LoopBB);
+ Register CondReg;
+
+ for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
+ for (MachineOperand &Op : MI.all_uses()) {
+ Register OldReg = Op.getReg();
+ if (!SGPROperandRegs.count(OldReg))
+ continue;
+
+ // See if we already processed this register in another instruction in
+ // the sequence.
+ auto OldVal = WaterfalledRegMap.find(OldReg);
+ if (OldVal != WaterfalledRegMap.end()) {
+ Op.setReg(OldVal->second);
+ continue;
+ }
+
+ Register OpReg = Op.getReg();
+ LLT OpTy = MRI.getType(OpReg);
+
+ // TODO: support for agpr
+ assert(MRI.getRegBank(OpReg) == VgprRB);
+ Register CurrentLaneReg = MRI.createVirtualRegister({SgprRB, OpTy});
+ buildReadFirstLane(B, CurrentLaneReg, OpReg, RBI);
+
+ // Build the comparison(s), CurrentLaneReg == OpReg.
+ unsigned OpSize = OpTy.getSizeInBits();
+ unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
+ LLT PartTy = LLT::scalar(PartSize);
+ unsigned NumParts = OpSize / PartSize;
+ SmallVector<Register, 8> OpParts;
+ SmallVector<Register, 8> CurrentLaneParts;
+
+ if (NumParts == 1) {
+ OpParts.push_back(OpReg);
+ CurrentLaneParts.push_back(CurrentLaneReg);
+ } else {
+ auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
+ auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
+ for (unsigned i = 0; i < NumParts; ++i) {
+ OpParts.push_back(UnmergeOp.getReg(i));
+ CurrentLaneParts.push_back(UnmergeCurrLane.getReg(i));
+ }
+ }
+
+ for (unsigned i = 0; i < NumParts; ++i) {
+ Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
+ B.buildICmp(CmpInst::ICMP_EQ, CmpReg, CurrentLaneParts[i], OpParts[i]);
+
+ if (!CondReg)
+ CondReg = CmpReg;
+ else
+ CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
+ }
+
+ Op.setReg(CurrentLaneReg);
+
+ // Make sure we don't re-process this register again.
+ WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
+ }
+ }
+
+ // Copy vcc to sgpr32/64, ballot becomes a no-op during instruction selection.
+ Register CondRegLM =
+ MRI.createVirtualRegister({WaveRC, LLT::scalar(IsWave32 ? 32 : 64)});
+ B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);
+
+ // Update EXEC, save the original EXEC value to SavedExec.
+ B.buildInstr(AndSaveExecOpc)
+ .addDef(SavedExec)
+ .addReg(CondRegLM, RegState::Kill);
+ MRI.setSimpleHint(SavedExec, CondRegLM);
+
+ B.setInsertPt(*BodyBB, BodyBB->end());
+
+ // Update EXEC, switch all done bits to 0 and all todo bits to 1.
+ B.buildInstr(XorTermOpc).addDef(ExecReg).addReg(ExecReg).addReg(SavedExec);
+
+ // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
+ // s_cbranch_scc0?
+
+ // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
+ B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
+
+ // Save the EXEC mask before the loop.
+ B.setInsertPt(MBB, MBB.end());
+ B.buildInstr(MovExecOpc).addDef(SaveExecReg).addReg(ExecReg);
+
+ // Restore the EXEC mask after the loop.
+ B.setInsertPt(*RestoreExecBB, RestoreExecBB->begin());
+ B.buildInstr(MovExecTermOpc).addDef(ExecReg).addReg(SaveExecReg);
+
+ // Set the insert point after the original instruction, so any new
+ // instructions will be in the remainder.
+ B.setInsertPt(*RemainderBB, RemainderBB->begin());
+
+ return true;
+}
+
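The new waterfall loop makes a divergent operand effectively uniform one value at a time: readfirstlane picks the value of the lowest active lane, the compare and AND_SAVEEXEC steps enable every lane that shares that value, the body executes once for that group, and the xor retires those lanes until exec is empty. The control flow is easiest to see as a scalar simulation over per-lane values (a sketch only, not the generated MIR):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Simulate a waterfall loop: Values[i] is the divergent operand of lane i,
    // Exec is the set of active lanes (bits set only for lanes in Values,
    // at most 64 lanes). Each iteration handles one unique value.
    static unsigned countWaterfallIterations(const std::vector<uint32_t> &Values,
                                             uint64_t Exec) {
      unsigned Iterations = 0;
      while (Exec) {
        // readfirstlane: take the value of the lowest active lane.
        unsigned FirstLane = __builtin_ctzll(Exec);
        uint32_t Current = Values[FirstLane];

        // Lanes whose value matches execute this trip (the AND_SAVEEXEC step).
        uint64_t SameValueMask = 0;
        for (unsigned Lane = 0; Lane < Values.size(); ++Lane)
          if (((Exec >> Lane) & 1) && Values[Lane] == Current)
            SameValueMask |= 1ULL << Lane;

        // ... the instruction runs here with Current as a uniform operand ...

        Exec &= ~SameValueMask; // the xor term: retire the lanes just handled
        ++Iterations;
      }
      return Iterations;
    }

    int main() {
      // Four lanes, two distinct descriptor values -> two loop iterations.
      assert(countWaterfallIterations({7, 9, 7, 9}, 0xF) == 2);
      // Fully uniform input finishes in a single pass.
      assert(countWaterfallIterations({5, 5, 5, 5}, 0xF) == 1);
      return 0;
    }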
void RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
ArrayRef<LLT> LLTBreakdown, LLT MergeTy) {
MachineFunction &MF = B.getMF();
@@ -391,7 +609,7 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
switch (Mapping.LoweringMethod) {
case DoNotLower:
- return;
+ break;
case VccExtToSel:
return lowerVccExtToSel(MI);
case UniExtToSel: {
@@ -527,7 +745,10 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
}
}
- // TODO: executeInWaterfallLoop(... WaterfallSgprs)
+ if (!WaterfallSgprs.empty()) {
+ MachineBasicBlock::iterator I = MI.getIterator();
+ executeInWaterfallLoop(B, make_range(I, std::next(I)), WaterfallSgprs);
+ }
}
LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
@@ -539,6 +760,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
case Vgpr16:
return LLT::scalar(16);
case Sgpr32:
+ case Sgpr32_WF:
case Sgpr32Trunc:
case Sgpr32AExt:
case Sgpr32AExtBoolInReg:
@@ -577,6 +799,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
case VgprV2S32:
return LLT::fixed_vector(2, 32);
case SgprV4S32:
+ case SgprV4S32_WF:
case VgprV4S32:
case UniInVgprV4S32:
return LLT::fixed_vector(4, 32);
@@ -650,6 +873,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
return VccRB;
case Sgpr16:
case Sgpr32:
+ case Sgpr32_WF:
case Sgpr64:
case Sgpr128:
case SgprP1:
@@ -662,6 +886,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
case SgprV2S16:
case SgprV2S32:
case SgprV4S32:
+ case SgprV4S32_WF:
case SgprB32:
case SgprB64:
case SgprB96:
@@ -923,6 +1148,14 @@ void RegBankLegalizeHelper::applyMappingSrc(
}
break;
}
+ // sgpr waterfall, scalars and vectors
+ case Sgpr32_WF:
+ case SgprV4S32_WF: {
+ assert(Ty == getTyFromID(MethodIDs[i]));
+ if (RB != SgprRB)
+ SgprWaterfallOperandRegs.insert(Reg);
+ break;
+ }
// sgpr and vgpr scalars with extend
case Sgpr32AExt: {
// Note: this ext allows S1, and it is meant to be combined away.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
index 08cc7d4..db965d8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
@@ -32,6 +32,7 @@ class RegBankLegalizeHelper {
const MachineUniformityInfo &MUI;
const RegisterBankInfo &RBI;
const RegBankLegalizeRules &RBLRules;
+ const bool IsWave32;
const RegisterBank *SgprRB;
const RegisterBank *VgprRB;
const RegisterBank *VccRB;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index a60855c..5a6ad40 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -529,7 +529,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
addRulesForGOpcs({G_ICMP})
.Any({{UniS1, _, S32}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}})
- .Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}});
+ .Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}})
+ .Any({{DivS1, _, S64}, {{Vcc}, {None, Vgpr64, Vgpr64}}});
addRulesForGOpcs({G_FCMP})
.Any({{UniS1, _, S32}, {{UniInVcc}, {None, Vgpr32, Vgpr32}}})
@@ -666,11 +667,15 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
// clang-format off
addRulesForGOpcs({G_LOAD})
.Any({{DivB32, DivP0}, {{VgprB32}, {VgprP0}}})
+ .Any({{DivB32, UniP0}, {{VgprB32}, {VgprP0}}})
.Any({{DivB32, DivP1}, {{VgprB32}, {VgprP1}}})
.Any({{{UniB256, UniP1}, isAlign4 && isUL}, {{SgprB256}, {SgprP1}}})
.Any({{{UniB512, UniP1}, isAlign4 && isUL}, {{SgprB512}, {SgprP1}}})
.Any({{{UniB32, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB32}, {SgprP1}}})
+ .Any({{{UniB64, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB64}, {SgprP1}}})
+ .Any({{{UniB96, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB96}, {SgprP1}}})
+ .Any({{{UniB128, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB128}, {SgprP1}}})
.Any({{{UniB256, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB256}, {VgprP1}, SplitLoad}})
.Any({{{UniB512, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB512}, {VgprP1}, SplitLoad}})
@@ -684,6 +689,7 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Any({{{UniB96, UniP4}, isAlign16 && isUL}, {{SgprB96}, {SgprP4}, WidenLoad}}, !hasUnalignedLoads)
.Any({{{UniB96, UniP4}, isAlign4 && !isAlign16 && isUL}, {{SgprB96}, {SgprP4}, SplitLoad}}, !hasUnalignedLoads)
.Any({{{UniB96, UniP4}, isAlign4 && isUL}, {{SgprB96}, {SgprP4}}}, hasUnalignedLoads)
+ .Any({{{UniB128, UniP4}, isAlign4 && isUL}, {{SgprB128}, {SgprP4}}})
.Any({{{UniB256, UniP4}, isAlign4 && isUL}, {{SgprB256}, {SgprP4}}})
.Any({{{UniB512, UniP4}, isAlign4 && isUL}, {{SgprB512}, {SgprP4}}})
.Any({{{UniB32, UniP4}, !isNaturalAlignedSmall || !isUL}, {{UniInVgprB32}, {VgprP4}}}, hasSMRDSmall) // i8 and i16 load
@@ -698,11 +704,15 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Any({{{UniB32, UniP4}, !isAlign4 || !isUL}, {{UniInVgprB32}, {VgprP4}}});
// clang-format on
- addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD}, Vector)
- .Div(S32, {{Vgpr32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}})
- .Uni(S32, {{UniInVgprS32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}})
- .Div(V4S32, {{VgprV4S32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}})
- .Uni(V4S32, {{UniInVgprV4S32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}});
+ addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD}, StandardB)
+ .Div(B32, {{VgprB32}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Uni(B32, {{UniInVgprB32}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Div(B64, {{VgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Uni(B64, {{UniInVgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Div(B96, {{VgprB96}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Uni(B96, {{UniInVgprB96}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Div(B128, {{VgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
+ .Uni(B128, {{UniInVgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}});
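The *_WF operand IDs above mark sources that must end up in SGPRs (the buffer resource and the scalar offset); when such a source is divergent, the instruction is executed inside a waterfall loop over the distinct per-lane values. A minimal C++ model of that idea, assuming a wave of fewer than 64 active lanes; the helper and its scalar loop are illustrative, not the actual MIR expansion:

#include <cstdint>
#include <vector>

// Conceptual model: pick the first active lane's value, run the operation for
// every lane holding the same value, then retire those lanes until the exec
// mask is empty (v_readfirstlane / v_cmp_eq / s_and_saveexec / s_xor).
template <typename Fn>
void waterfall(const std::vector<uint32_t> &LaneVals, Fn RunWithUniformVal) {
  uint64_t Exec = (1ull << LaneVals.size()) - 1; // assumes fewer than 64 lanes
  while (Exec) {
    unsigned FirstLane = __builtin_ctzll(Exec);
    uint32_t Val = LaneVals[FirstLane];
    uint64_t Match = 0;
    for (unsigned L = 0; L < LaneVals.size(); ++L)
      if (((Exec >> L) & 1) && LaneVals[L] == Val)
        Match |= 1ull << L;
    RunWithUniformVal(Val, Match); // the operand is uniform for these lanes
    Exec &= ~Match;                // mask off the lanes just handled
  }
}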
addRulesForGOpcs({G_STORE})
.Any({{S32, P0}, {{}, {Vgpr32, VgprP0}}})
@@ -716,7 +726,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
addRulesForGOpcs({G_PTR_ADD})
.Any({{UniP1}, {{SgprP1}, {SgprP1, Sgpr64}}})
.Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}})
- .Any({{DivP0}, {{VgprP0}, {VgprP0, Vgpr64}}});
+ .Any({{DivP0}, {{VgprP0}, {VgprP0, Vgpr64}}})
+ .Any({{UniP4}, {{SgprP4}, {SgprP4, Sgpr64}}});
addRulesForGOpcs({G_INTTOPTR})
.Any({{UniPtr32}, {{SgprPtr32}, {Sgpr32}}})
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
index 7243d75..1391440 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
@@ -188,7 +188,11 @@ enum RegBankLLTMappingApplyID {
Sgpr32Trunc,
- // Src only modifiers: waterfalls, extends
+ // Src only modifiers: execute in waterfall loop if divergent
+ Sgpr32_WF,
+ SgprV4S32_WF,
+
+ // Src only modifiers: extends
Sgpr32AExt,
Sgpr32AExtBoolInReg,
Sgpr32SExt,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index f1caf24..787db67 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2528,7 +2528,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
// Special case for s_mul_u64. There is not a vector equivalent of
// s_mul_u64. Hence, we have to break down s_mul_u64 into 32-bit vector
// multiplications.
- if (Opc == AMDGPU::G_MUL && DstTy.getSizeInBits() == 64) {
+ if (!Subtarget.hasVectorMulU64() && Opc == AMDGPU::G_MUL &&
+ DstTy.getSizeInBits() == 64) {
applyMappingSMULU64(B, OpdMapper);
return;
}
@@ -3500,19 +3501,24 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
applyMappingMAD_64_32(B, OpdMapper);
return;
case AMDGPU::G_PREFETCH: {
- if (!Subtarget.hasPrefetch() || !Subtarget.hasSafeSmemPrefetch()) {
+ if (!Subtarget.hasSafeSmemPrefetch() && !Subtarget.hasVmemPrefInsts()) {
MI.eraseFromParent();
return;
}
Register PtrReg = MI.getOperand(0).getReg();
unsigned PtrBank = getRegBankID(PtrReg, MRI, AMDGPU::SGPRRegBankID);
- if (PtrBank == AMDGPU::VGPRRegBankID) {
+ if (PtrBank == AMDGPU::VGPRRegBankID &&
+ (!Subtarget.hasVmemPrefInsts() || !MI.getOperand(3).getImm())) {
+ // Cannot do I$ prefetch with divergent pointer.
MI.eraseFromParent();
return;
}
unsigned AS = MRI.getType(PtrReg).getAddressSpace();
- if (!AMDGPU::isFlatGlobalAddrSpace(AS) &&
- AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
+ if ((!AMDGPU::isFlatGlobalAddrSpace(AS) &&
+ AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
+ (!Subtarget.hasSafeSmemPrefetch() &&
+ (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
+ !MI.getOperand(3).getImm() /* I$ prefetch */))) {
MI.eraseFromParent();
return;
}
@@ -3973,7 +3979,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
} else {
- OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
+ if (MI.getOpcode() == AMDGPU::G_MUL && Subtarget.hasVectorMulU64())
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+ else
+ OpdsMapping[0] =
+ getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
@@ -5432,6 +5442,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
break;
}
+ case Intrinsic::amdgcn_flat_prefetch:
+ case Intrinsic::amdgcn_global_prefetch:
+ return getDefaultMappingVOP(MI);
default:
return getInvalidInstructionMapping();
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 1e44be8..6878744 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -61,6 +61,7 @@ protected:
bool EnableRealTrue16Insts = false;
bool HasBF16TransInsts = false;
bool HasBF16ConversionInsts = false;
+ bool HasBF16PackedInsts = false;
bool HasMadMixInsts = false;
bool HasMadMacF32Insts = false;
bool HasDsSrc2Insts = false;
@@ -209,6 +210,8 @@ public:
return HasBF16ConversionInsts;
}
+ bool hasBF16PackedInsts() const { return HasBF16PackedInsts; }
+
bool hasMadMixInsts() const {
return HasMadMixInsts;
}
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index dc83230..421fc42 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -5139,13 +5139,45 @@ bool AMDGPUAsmParser::validateAGPRLdSt(const MCInst &Inst) const {
bool AMDGPUAsmParser::validateVGPRAlign(const MCInst &Inst) const {
auto FB = getFeatureBits();
+ if (!FB[AMDGPU::FeatureGFX90AInsts] && !FB[AMDGPU::FeatureGFX1250Insts])
+ return true;
+
unsigned Opc = Inst.getOpcode();
+ const MCRegisterInfo *MRI = getMRI();
// DS_READ_B96_TR_B6 is the only DS instruction in GFX950, that allows
// unaligned VGPR. All others only allow even aligned VGPRs.
- if (!(FB[AMDGPU::FeatureGFX90AInsts]) || Opc == AMDGPU::DS_READ_B96_TR_B6_vi)
+ if (FB[AMDGPU::FeatureGFX90AInsts] && Opc == AMDGPU::DS_READ_B96_TR_B6_vi)
return true;
- const MCRegisterInfo *MRI = getMRI();
+ if (FB[AMDGPU::FeatureGFX1250Insts]) {
+ switch (Opc) {
+ default:
+ break;
+ case AMDGPU::DS_LOAD_TR6_B96:
+ case AMDGPU::DS_LOAD_TR6_B96_gfx12:
+      // DS_LOAD_TR6_B96 is the only DS instruction in GFX1250 that
+      // allows unaligned VGPR. All others only allow even aligned VGPRs.
+ return true;
+ case AMDGPU::GLOBAL_LOAD_TR6_B96:
+ case AMDGPU::GLOBAL_LOAD_TR6_B96_gfx1250: {
+      // GLOBAL_LOAD_TR6_B96 is the only GLOBAL instruction in GFX1250 that
+ // allows unaligned VGPR for vdst, but other operands still only allow
+ // even aligned VGPRs.
+ int VAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
+ if (VAddrIdx != -1) {
+ const MCOperand &Op = Inst.getOperand(VAddrIdx);
+ MCRegister Sub = MRI->getSubReg(Op.getReg(), AMDGPU::sub0);
+ if ((Sub - AMDGPU::VGPR0) & 1)
+ return false;
+ }
+ return true;
+ }
+ case AMDGPU::GLOBAL_LOAD_TR6_B96_SADDR:
+ case AMDGPU::GLOBAL_LOAD_TR6_B96_SADDR_gfx1250:
+ return true;
+ }
+ }
+
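A small C++ sketch of the alignment test this validator applies, with an illustrative helper name: a tuple passes only if its first VGPR has an even index, so v[2:4] is accepted while v[3:5] is rejected.

// Mirrors the (Sub - AMDGPU::VGPR0) & 1 check above and in the loop below.
static bool isEvenAlignedVgprTuple(unsigned FirstVgprIndex) {
  return (FirstVgprIndex & 1) == 0;
}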
const MCRegisterClass &VGPR32 = MRI->getRegClass(AMDGPU::VGPR_32RegClassID);
const MCRegisterClass &AGPR32 = MRI->getRegClass(AMDGPU::AGPR_32RegClassID);
for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
@@ -5292,6 +5324,12 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst,
unsigned CPol = Inst.getOperand(CPolPos).getImm();
if (!isGFX1250()) {
+ if (CPol & CPol::SCAL) {
+ SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands);
+ StringRef CStr(S.getPointer());
+ S = SMLoc::getFromPointer(&CStr.data()[CStr.find("scale_offset")]);
+ Error(S, "scale_offset is not supported on this GPU");
+ }
if (CPol & CPol::NV) {
SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands);
StringRef CStr(S.getPointer());
@@ -5300,6 +5338,13 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst,
}
}
+ if ((CPol & CPol::SCAL) && !supportsScaleOffset(MII, Inst.getOpcode())) {
+ SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands);
+ StringRef CStr(S.getPointer());
+ S = SMLoc::getFromPointer(&CStr.data()[CStr.find("scale_offset")]);
+ Error(S, "scale_offset is not supported for this instruction");
+ }
+
if (isGFX12Plus())
return validateTHAndScopeBits(Inst, Operands, CPol);
@@ -6971,6 +7016,7 @@ ParseStatus AMDGPUAsmParser::parseCPol(OperandVector &Operands) {
ParseStatus ResTH = ParseStatus::NoMatch;
ParseStatus ResScope = ParseStatus::NoMatch;
ParseStatus ResNV = ParseStatus::NoMatch;
+ ParseStatus ResScal = ParseStatus::NoMatch;
for (;;) {
if (ResTH.isNoMatch()) {
@@ -7009,10 +7055,22 @@ ParseStatus AMDGPUAsmParser::parseCPol(OperandVector &Operands) {
}
}
+ if (ResScal.isNoMatch()) {
+ if (trySkipId("scale_offset")) {
+ ResScal = ParseStatus::Success;
+ CPolVal |= CPol::SCAL;
+ continue;
+ } else if (trySkipId("no", "scale_offset")) {
+ ResScal = ParseStatus::Success;
+ continue;
+ }
+ }
+
break;
}
- if (ResTH.isNoMatch() && ResScope.isNoMatch() && ResNV.isNoMatch())
+ if (ResTH.isNoMatch() && ResScope.isNoMatch() && ResNV.isNoMatch() &&
+ ResScal.isNoMatch())
return ParseStatus::NoMatch;
Operands.push_back(AMDGPUOperand::CreateImm(this, CPolVal, StringLoc,
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index f7f29f1..5ccf1e5 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -14,7 +14,7 @@ let WantsRoot = true in {
def GlobalSAddr : ComplexPattern<iPTR, 4, "SelectGlobalSAddr", [], [], -10>;
def GlobalSAddrGLC : ComplexPattern<iPTR, 4, "SelectGlobalSAddrGLC", [], [], -10>;
def ScratchSAddr : ComplexPattern<iPTR, 2, "SelectScratchSAddr", [], [], -10>;
- def ScratchSVAddr : ComplexPattern<iPTR, 3, "SelectScratchSVAddr", [], [], -10>;
+ def ScratchSVAddr : ComplexPattern<iPTR, 4, "SelectScratchSVAddr", [], [], -10>;
}
class True16D16Table <string hiOp, string loOp> {
@@ -464,6 +464,37 @@ class FLAT_Global_Invalidate_Writeback<string opName, SDPatternOperator node = n
let sve = 0;
}
+class FLAT_Prefetch_Pseudo<string opName, dag addr = (ins VReg_64:$vaddr), string asm = " $vaddr"> :
+ FLAT_Pseudo<opName, (outs), !con(addr, (ins flat_offset:$offset, CPol_0:$cpol)), asm#"$offset$cpol"> {
+ let has_vdst = 0;
+ let has_data = 0;
+ let mayLoad = 1;
+ let mayStore = 1;
+ let VM_CNT = 0;
+ let LGKM_CNT = 0;
+}
+
+multiclass FLAT_Flat_Prefetch_Pseudo<string opName> {
+ def "" : FLAT_Prefetch_Pseudo<opName>,
+ GlobalSaddrTable<0, opName>;
+ def _SADDR : FLAT_Prefetch_Pseudo<opName, (ins SReg_64:$saddr, VGPR_32:$vaddr), " $vaddr, $saddr">,
+ GlobalSaddrTable<1, opName> {
+ let OtherPredicates = [HasFlatGVSMode];
+ let enabled_saddr = 1;
+ }
+}
+
+multiclass FLAT_Global_Prefetch_Pseudo<string opName> {
+ let is_flat_global = 1, has_saddr = 1 in {
+ def "" : FLAT_Prefetch_Pseudo<opName, (ins VReg_64:$vaddr), " $vaddr, off">,
+ GlobalSaddrTable<0, opName>;
+ def _SADDR : FLAT_Prefetch_Pseudo<opName, (ins SReg_64:$saddr, VGPR_32:$vaddr), " $vaddr, $saddr">,
+ GlobalSaddrTable<1, opName> {
+ let enabled_saddr = 1;
+ }
+ }
+}
+
class FlatScratchInst <string sv_op, string mode> {
string SVOp = sv_op;
string Mode = mode;
@@ -1162,6 +1193,16 @@ defm SCRATCH_LOAD_LDS_USHORT : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_u
defm SCRATCH_LOAD_LDS_SSHORT : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_sshort">;
defm SCRATCH_LOAD_LDS_DWORD : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_dword">;
+let SubtargetPredicate = isGFX125xOnly in {
+defm FLAT_LOAD_MONITOR_B32 : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b32", VGPR_32>;
+defm FLAT_LOAD_MONITOR_B64 : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b64", VReg_64>;
+defm FLAT_LOAD_MONITOR_B128 : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b128", VReg_128>;
+
+defm GLOBAL_LOAD_MONITOR_B32 : FLAT_Global_Load_Pseudo <"global_load_monitor_b32", VGPR_32>;
+defm GLOBAL_LOAD_MONITOR_B64 : FLAT_Global_Load_Pseudo <"global_load_monitor_b64", VReg_64>;
+defm GLOBAL_LOAD_MONITOR_B128 : FLAT_Global_Load_Pseudo <"global_load_monitor_b128", VReg_128>;
+} // End SubtargetPredicate = isGFX125xOnly
+
let SubtargetPredicate = isGFX12Plus in {
let Uses = [EXEC, M0] in {
defm GLOBAL_LOAD_BLOCK : FLAT_Global_Load_Pseudo <"global_load_block", VReg_1024>;
@@ -1218,6 +1259,11 @@ let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in
"global_atomic_pk_add_f16", VGPR_32, v2f16
>;
+let SubtargetPredicate = HasVmemPrefInsts in {
+ defm FLAT_PREFETCH_B8 : FLAT_Flat_Prefetch_Pseudo<"flat_prefetch_b8">;
+ defm GLOBAL_PREFETCH_B8 : FLAT_Global_Prefetch_Pseudo<"global_prefetch_b8">;
+}
+
//===----------------------------------------------------------------------===//
// Flat Patterns
//===----------------------------------------------------------------------===//
@@ -1443,19 +1489,19 @@ class ScratchStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
>;
class ScratchLoadSVaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset))),
- (inst $vaddr, $saddr, $offset, 0)
+ (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol))),
+ (inst $vaddr, $saddr, $offset, $cpol)
>;
class ScratchStoreSVaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt> : GCNPat <
- (node vt:$data, (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset)),
- (inst getVregSrcForVT<vt>.ret:$data, $vaddr, $saddr, $offset)
+ (node vt:$data, (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol)),
+ (inst getVregSrcForVT<vt>.ret:$data, $vaddr, $saddr, $offset, $cpol)
>;
class ScratchLoadSVaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset), vt:$in)),
- (inst $vaddr, $saddr, $offset, 0, $in)
+ (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol), vt:$in)),
+ (inst $vaddr, $saddr, $offset, $cpol, $in)
>;
class ScratchLoadSVaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
@@ -2138,6 +2184,77 @@ defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16, load_d16_lo_private, v2f
} // End OtherPredicates = [HasFlatScratchInsts,EnableFlatScratch]
+def PrefetchLoc: SDNodeXForm<timm, [{
+ uint32_t V = N->getZExtValue();
+ V = (AMDGPU::CPol::SCOPE_MASK - (V & AMDGPU::CPol::SCOPE_MASK)) << AMDGPU::CPol::SCOPE_SHIFT;
+ if (!Subtarget->hasSafeCUPrefetch())
+ V = std::max(V, (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe
+ return CurDAG->getTargetConstant(V, SDLoc(N), MVT::i32);
+}]>;
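A standalone C++ sketch of the locality-to-scope mapping PrefetchLoc performs, assuming the CPol values from SIDefines.h (SCOPE_SHIFT = 3, SCOPE_MASK = 0x3, SCOPE_SE = 1 << 3); the helper name is illustrative:

#include <algorithm>
#include <cstdint>

static uint32_t prefetchLocToScope(uint32_t Loc, bool HasSafeCUPrefetch) {
  constexpr uint32_t ScopeShift = 3, ScopeMask = 0x3;
  constexpr uint32_t ScopeSE = 1u << ScopeShift;
  // Locality 3 (keep closest) maps to CU scope 0; locality 0 maps to SYS 3.
  uint32_t V = (ScopeMask - (Loc & ScopeMask)) << ScopeShift;
  if (!HasSafeCUPrefetch)
    V = std::max(V, ScopeSE); // CU scope is unsafe, widen to at least SE
  return V;
}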
+
+def prefetch_flat : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type),
+ (prefetch node:$ptr, node:$rw, node:$loc, node:$type),
+ [{ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS; }]> {
+ let GISelPredicateCode = [{
+ return (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
+ }];
+}
+
+def prefetch_global : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type),
+ (prefetch node:$ptr, node:$rw, node:$loc, node:$type),
+ [{ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+ (cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+ !Subtarget->hasSafeSmemPrefetch()); }]> {
+ let GISelPredicateCode = [{
+ return (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+ ((*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+ !Subtarget->hasSafeSmemPrefetch());
+ }];
+}
+
+multiclass FlatPrefetchPats<string inst, SDPatternOperator prefetch_kind, SDPatternOperator rw> {
+ def : GCNPat <
+ (prefetch_kind (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), rw, (i32 timm:$loc), i32imm_one),
+ (!cast<FLAT_Pseudo>(inst) $vaddr, $offset, (i32 (PrefetchLoc $loc)))
+ > {
+ let AddedComplexity = !if(!eq(rw, i32imm_zero), 0, 25);
+ }
+
+ def : GCNPat <
+ (prefetch_kind (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), rw, (i32 timm:$loc), i32imm_one),
+ (!cast<FLAT_Pseudo>(inst#"_SADDR") $saddr, $voffset, $offset, (i32 (PrefetchLoc $loc)))
+ > {
+ let AddedComplexity = !if(!eq(rw, i32imm_zero), 11, 30);
+ }
+}
+
+multiclass FlatIntrPrefetchPats<string inst, SDPatternOperator intr> {
+ def : GCNPat <
+ (intr (FlatOffset i64:$vaddr, i32:$offset), timm:$cpol),
+ (!cast<FLAT_Pseudo>(inst) $vaddr, $offset, $cpol)
+ >;
+
+ def : GCNPat <
+ (intr (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), timm:$cpol),
+ (!cast<FLAT_Pseudo>(inst#"_SADDR") $saddr, $voffset, $offset, $cpol)> {
+ let AddedComplexity = 11;
+ }
+}
+
+let SubtargetPredicate = HasVmemPrefInsts in {
+ defm : FlatPrefetchPats<"FLAT_PREFETCH_B8", prefetch_flat, i32imm_zero>;
+ defm : FlatPrefetchPats<"GLOBAL_PREFETCH_B8", prefetch_global, i32imm_zero>;
+
+ // Patterns for forced vector prefetch with rw = 1.
+ defm : FlatPrefetchPats<"FLAT_PREFETCH_B8", prefetch_flat, i32imm_one>;
+ defm : FlatPrefetchPats<"GLOBAL_PREFETCH_B8", prefetch_global, i32imm_one>;
+
+ // Patterns for target intrinsics
+ defm : FlatIntrPrefetchPats<"FLAT_PREFETCH_B8", int_amdgcn_flat_prefetch>;
+ defm : FlatIntrPrefetchPats<"GLOBAL_PREFETCH_B8", int_amdgcn_global_prefetch>;
+} // End SubtargetPredicate = HasVmemPrefInsts
+
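For reference, a hedged illustration of the llvm.prefetch operands these patterns key on, i.e. (ptr, rw, locality, cache type): the trailing i32imm_one matches data prefetches (cache type 1) and the locality operand feeds PrefetchLoc.

void warm(const int *P) {
  // Read prefetch, highest locality: lowered to the closest allowed scope.
  __builtin_prefetch(P, /*rw=*/0, /*locality=*/3);
  // Write prefetch, lowest locality: selects the rw = 1 patterns, SYS scope.
  __builtin_prefetch(P + 64, /*rw=*/1, /*locality=*/0);
}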
//===----------------------------------------------------------------------===//
// Target
//===----------------------------------------------------------------------===//
@@ -2941,6 +3058,7 @@ multiclass VFLAT_Real_gfx12 <bits<8> op, string name = get_FLAT_ps<NAME>.Mnemoni
let DecoderNamespace = "GFX12";
let Inst{25-24} = {ps.is_flat_global, ps.is_flat_scratch};
+ let Inst{48} = cpol{CPolBit.SCAL}; // scale offset
}
}
@@ -3170,6 +3288,7 @@ multiclass VFLAT_Real_gfx1250<bits<8> op,
let DecoderNamespace = "GFX1250";
let Inst{25-24} = {ps.is_flat_global, ps.is_flat_scratch};
+ let Inst{48} = cpol{CPolBit.SCAL}; // scale offset
}
}
@@ -3208,6 +3327,17 @@ multiclass VFLAT_Real_Atomics_gfx1250<bits<8> op, string name = get_FLAT_ps<NAME
defm TENSOR_SAVE : VFLAT_Real_gfx1250<0x06e>;
defm TENSOR_STOP : VFLAT_Real_gfx1250<0x06f>;
+defm FLAT_PREFETCH_B8 : VFLAT_Real_AllAddr_gfx1250<0x05d>;
+defm GLOBAL_PREFETCH_B8 : VFLAT_Real_AllAddr_gfx1250<0x05d>;
+
+defm FLAT_LOAD_MONITOR_B32 : VFLAT_Real_AllAddr_gfx1250<0x070>;
+defm FLAT_LOAD_MONITOR_B64 : VFLAT_Real_AllAddr_gfx1250<0x071>;
+defm FLAT_LOAD_MONITOR_B128 : VFLAT_Real_AllAddr_gfx1250<0x072>;
+
+defm GLOBAL_LOAD_MONITOR_B32 : VFLAT_Real_AllAddr_gfx1250<0x070>;
+defm GLOBAL_LOAD_MONITOR_B64 : VFLAT_Real_AllAddr_gfx1250<0x071>;
+defm GLOBAL_LOAD_MONITOR_B128 : VFLAT_Real_AllAddr_gfx1250<0x072>;
+
defm GLOBAL_LOAD_TR_B128_w32 : VFLAT_Real_AllAddr_gfx1250<0x057, "global_load_tr16_b128">;
defm GLOBAL_LOAD_TR_B64_w32 : VFLAT_Real_AllAddr_gfx1250<0x058, "global_load_tr8_b64">;
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index c4a3be4..94886b0 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -520,8 +520,8 @@ static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
const MachineInstr *MI, IsExpiredFn IsExpired) {
DenseSet<const MachineBasicBlock *> Visited;
return getWaitStatesSince(IsHazard, MI->getParent(),
- std::next(MI->getReverseIterator()),
- 0, IsExpired, Visited);
+ std::next(MI->getReverseIterator()), 0, IsExpired,
+ Visited, SIInstrInfo::getNumWaitStates);
}
int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
@@ -1190,7 +1190,8 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
fixVALUPartialForwardingHazard(MI);
fixVALUTransUseHazard(MI);
fixVALUTransCoexecutionHazards(MI);
- fixWMMAHazards(MI);
+ fixWMMAHazards(MI); // fall-through if co-execution is enabled.
+ fixWMMACoexecutionHazards(MI);
fixShift64HighRegBug(MI);
fixVALUMaskWriteHazard(MI);
fixRequiredExportPriority(MI);
@@ -1909,6 +1910,182 @@ bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
return true;
}
+static bool isCoexecutableVALUInst(const MachineInstr &MI) {
+ return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isTRANS(MI) &&
+ !SIInstrInfo::isWMMA(MI) && !SIInstrInfo::isSWMMAC(MI); // What else?
+}
+
+static bool IsWMMAHazardInstInCategory(const MachineInstr &MI,
+ const SIInstrInfo *TII, unsigned Latency,
+ unsigned Category) {
+ assert(TII->isXDLWMMA(MI) && (Latency == 8 || Latency == 16) &&
+ "Handle me if the xdl wmma instruction latency changes");
+
+ switch (Category) {
+ case 0: // Dense WMMA Instructions:
+ // WMMA_*F16, WMMA_*BF16
+ // WMMA_*FP8FP8
+ // WMMA_*FP8BF8
+ // WMMA_*BF8FP8
+ // WMMA_*BF8BF8
+ // WMMA_*F8F6F4 if SRCA & SRCB != F8
+ return Latency == 8 && SIInstrInfo::isWMMA(MI);
+
+ case 1: // Dense WMMA Instructions:
+ // WMMA_IU8
+ // WMMA_IU4
+ // WMMA_*F8F6F4 if SRCA OR SRCB == F8
+ return Latency == 16 && SIInstrInfo::isWMMA(MI);
+
+ case 2: // Dense SWMMAC Instructions
+ // SWMMAC_*F16, SWMMAC_*BF16,
+ // SWMMAC_*FP8FP8
+ // SWMMAC_*BF8FP8
+ // SWMMAC_*FP8BF8
+ // SWMMAC_*BF8BF8
+ return Latency == 8 && SIInstrInfo::isSWMMAC(MI);
+
+ case 3: // Sparse WMMA Instructions:
+ // SWMMAC_IU8
+ // SWMMAC_IU4
+ return Latency == 16 && SIInstrInfo::isSWMMAC(MI);
+ default:
+ break;
+ } // end switch.
+
+ return false;
+}
+
+bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) {
+ if (!AMDGPU::isGFX1250(ST))
+ return false;
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ if (!TII->isXDLWMMA(*MI) && !isCoexecutableVALUInst(*MI))
+ return false;
+
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+  // WaitStates here is the number of V_NOPs or unrelated VALU instructions that
+  // must be in between the first WMMA and the second instruction to cover the
+  // hazard (WMMAWaitStates if the second is also a WMMA, VALUWaitStates if the
+  // second is a VALU). Refer to SPG 4.6.12.1. "Requirements for WMMA data
+  // hazards" for the numbers, which depend on the category of the first WMMA.
+ const int WMMAWaitStates[] = {5, 9, 3, 5};
+ const int VALUWaitStates[] = {4, 8, 2, 4};
+ unsigned Category = 0;
+
+ auto IsWMMAHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) {
+ if (!TII->isXDLWMMA(I))
+ return false;
+
+ unsigned Latency = TSchedModel.computeInstrLatency(&I);
+ if (!IsWMMAHazardInstInCategory(I, TII, Latency, Category))
+ return false;
+
+ Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
+ Register A1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
+ Register B1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
+
+    // WMMA0 writes (D0), WMMA1 reads (A1/B1/Idx1).
+ if (TRI->regsOverlap(D0, A1) || TRI->regsOverlap(D0, B1))
+ return true;
+
+ if (SIInstrInfo::isSWMMAC(*MI)) {
+ Register Idx1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
+ if (TRI->regsOverlap(D0, Idx1))
+ return true;
+ }
+
+ return false;
+ };
+
+ auto IsVALUHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) {
+ if (!TII->isXDLWMMA(I))
+ return false;
+
+ unsigned Latency = TSchedModel.computeInstrLatency(&I);
+ if (!IsWMMAHazardInstInCategory(I, TII, Latency, Category))
+ return false;
+
+ // WMMA writes, VALU reads.
+ Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
+ for (const MachineOperand &ValuUse : MI->explicit_uses()) {
+ if (ValuUse.isReg() && TRI->regsOverlap(D0, ValuUse.getReg()))
+ return true;
+ }
+
+ auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
+ if (!ValuDst || !ValuDst->isReg())
+ return false;
+ Register D1 = ValuDst->getReg();
+
+ // WMMA writes, VALU writes.
+ if (TRI->regsOverlap(D0, D1))
+ return true;
+
+ // WMMA reads, VALU writes.
+ Register A0 = TII->getNamedOperand(I, AMDGPU::OpName::src0)->getReg();
+ Register B0 = TII->getNamedOperand(I, AMDGPU::OpName::src1)->getReg();
+ if (TRI->regsOverlap(A0, D1) || TRI->regsOverlap(B0, D1))
+ return true;
+
+ if (SIInstrInfo::isSWMMAC(I)) {
+ Register Idx0 = TII->getNamedOperand(I, AMDGPU::OpName::src2)->getReg();
+ if (TRI->regsOverlap(D1, Idx0))
+ return true;
+ }
+
+ return false;
+ };
+
+ int Limit = 0;
+ auto IsExpiredFn = [&Limit](const MachineInstr &, int WaitStates) {
+ return WaitStates >= Limit;
+ };
+
+ auto GetWaitStatesFn = [](const MachineInstr &I) {
+ return SIInstrInfo::isVALU(I) ? 1 : 0;
+ };
+
+ int WaitStatesNeeded = -1;
+ if (TII->isXDLWMMA(*MI)) {
+ for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
+ Limit = WMMAWaitStates[Category]; // for IsExpiredFn.
+ DenseSet<const MachineBasicBlock *> Visited;
+      // '::getWaitStatesSince' returns the number of VALUs in between if a
+      // hazard exists, and INT_MAX if there is no hazard. As a result, a negative
+ // WaitStatesNeeded here means no hazard, and we will continue to search
+ // for other categories.
+ WaitStatesNeeded =
+ Limit - ::getWaitStatesSince(IsWMMAHazardFn, MI->getParent(),
+ std::next(MI->getReverseIterator()), 0,
+ IsExpiredFn, Visited, GetWaitStatesFn);
+ }
+ } else { // Must be a co-executable VALU.
+ for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
+ Limit = VALUWaitStates[Category]; // for IsExpiredFn.
+ DenseSet<const MachineBasicBlock *> Visited;
+      // '::getWaitStatesSince' returns the number of VALUs in between if a
+      // hazard exists, and INT_MAX if there is no hazard. As a result, a negative
+ // WaitStatesNeeded here means no hazard, and we will continue to search
+ // for other categories.
+ WaitStatesNeeded =
+ Limit - ::getWaitStatesSince(IsVALUHazardFn, MI->getParent(),
+ std::next(MI->getReverseIterator()), 0,
+ IsExpiredFn, Visited, GetWaitStatesFn);
+ }
+ }
+
+  // WaitStatesNeeded is now the number of V_NOPs we need to insert; a negative
+  // value means none are needed.
+ for (int i = 0; i < WaitStatesNeeded; i++)
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII->get(AMDGPU::V_NOP_e32));
+
+ return true;
+}
+
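A short sketch of the wait-state arithmetic above, with an illustrative helper and worked numbers; ::getWaitStatesSince counts co-executable VALUs between the hazardous WMMA and MI:

#include <algorithm>

// Limit is WMMAWaitStates[Category] or VALUWaitStates[Category]; Distance is
// the VALU count returned by ::getWaitStatesSince. Non-positive means no hazard.
static int wmmaCoexecNopsNeeded(int Limit, int Distance) {
  return std::max(0, Limit - Distance);
}
// Example: a category-1 XDL WMMA followed 3 VALUs later by a dependent VALU
// needs wmmaCoexecNopsNeeded(8, 3) == 5 V_NOPs (VALUWaitStates[1] == 8).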
bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
if (!ST.hasShift64HighRegBug())
return false;
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index ef6ddd8..f796eeae 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -106,6 +106,7 @@ private:
bool fixVALUTransUseHazard(MachineInstr *MI);
bool fixVALUTransCoexecutionHazards(MachineInstr *MI);
bool fixWMMAHazards(MachineInstr *MI);
+ bool fixWMMACoexecutionHazards(MachineInstr *MI);
bool fixShift64HighRegBug(MachineInstr *MI);
bool fixVALUMaskWriteHazard(MachineInstr *MI);
bool fixRequiredExportPriority(MachineInstr *MI);
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 7b8f0f4..9a2bab1 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -324,7 +324,7 @@ bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
}
void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
- unsigned NumRegionInstrs) const {
+ const SchedRegion &Region) const {
// Track register pressure so the scheduler can try to decrease
// pressure once register usage is above the threshold defined by
// SIRegisterInfo::getRegPressureSetLimit()
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 268162b..88a269f 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -123,6 +123,7 @@ protected:
bool HasSMemRealTime = false;
bool HasIntClamp = false;
bool HasFmaMixInsts = false;
+ bool HasFmaMixBF16Insts = false;
bool HasMovrel = false;
bool HasVGPRIndexMode = false;
bool HasScalarDwordx3Loads = false;
@@ -244,7 +245,9 @@ protected:
bool HasVMEMtoScalarWriteHazard = false;
bool HasSMEMtoVectorWriteHazard = false;
bool HasInstFwdPrefetchBug = false;
+ bool HasVmemPrefInsts = false;
bool HasSafeSmemPrefetch = false;
+ bool HasSafeCUPrefetch = false;
bool HasVcmpxExecWARHazard = false;
bool HasLdsBranchVmemWARHazard = false;
bool HasNSAtoVMEMBug = false;
@@ -265,8 +268,10 @@ protected:
bool HasIEEEMinimumMaximumInsts = false;
bool HasMinimum3Maximum3F32 = false;
bool HasMinimum3Maximum3F16 = false;
+ bool HasMin3Max3PKF16 = false;
bool HasMinimum3Maximum3PKF16 = false;
bool HasLshlAddU64Inst = false;
+ bool HasAddSubU64Insts = false;
bool HasPointSampleAccel = false;
bool HasLdsBarrierArriveAtomic = false;
bool HasSetPrioIncWgInst = false;
@@ -460,6 +465,8 @@ public:
return HasFmaMixInsts;
}
+ bool hasFmaMixBF16Insts() const { return HasFmaMixBF16Insts; }
+
bool hasCARRY() const {
return true;
}
@@ -985,8 +992,12 @@ public:
bool hasPrefetch() const { return GFX12Insts; }
+ bool hasVmemPrefInsts() const { return HasVmemPrefInsts; }
+
bool hasSafeSmemPrefetch() const { return HasSafeSmemPrefetch; }
+ bool hasSafeCUPrefetch() const { return HasSafeCUPrefetch; }
+
// Has s_cmpk_* instructions.
bool hasSCmpK() const { return getGeneration() < GFX12; }
@@ -1022,7 +1033,7 @@ public:
}
void overrideSchedPolicy(MachineSchedPolicy &Policy,
- unsigned NumRegionInstrs) const override;
+ const SchedRegion &Region) const override;
void mirFileLoaded(MachineFunction &MF) const override;
@@ -1162,8 +1173,14 @@ public:
bool hasLshlAddU64Inst() const { return HasLshlAddU64Inst; }
+  // Scalar and global loads support the scale_offset bit.
+ bool hasScaleOffset() const { return GFX1250Insts; }
+
bool hasFlatGVSMode() const { return FlatGVSMode; }
+ // FLAT GLOBAL VOffset is signed
+ bool hasSignedGVSOffset() const { return GFX1250Insts; }
+
bool enableSIScheduler() const {
return EnableSIScheduler;
}
@@ -1300,7 +1317,7 @@ public:
bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
- bool hasVALUReadSGPRHazard() const { return getGeneration() == GFX12; }
+ bool hasVALUReadSGPRHazard() const { return GFX12Insts && !GFX1250Insts; }
/// Return if operations acting on VGPR tuples require even alignment.
bool needsAlignedVGPRs() const { return GFX90AInsts || GFX1250Insts; }
@@ -1381,6 +1398,8 @@ public:
return HasMinimum3Maximum3F16;
}
+ bool hasMin3Max3PKF16() const { return HasMin3Max3PKF16; }
+
bool hasTanhInsts() const { return HasTanhInsts; }
bool hasAddPC64Inst() const { return GFX1250Insts; }
@@ -1494,6 +1513,18 @@ public:
bool hasVOPD3() const { return GFX1250Insts; }
+ // \returns true if the target has V_ADD_U64/V_SUB_U64 instructions.
+ bool hasAddSubU64Insts() const { return HasAddSubU64Insts; }
+
+ // \returns true if the target has V_MUL_U64/V_MUL_I64 instructions.
+ bool hasVectorMulU64() const { return GFX1250Insts; }
+
+ // \returns true if the target has V_PK_ADD_{MIN|MAX}_{I|U}16 instructions.
+ bool hasPkAddMinMaxInsts() const { return GFX1250Insts; }
+
+ // \returns true if the target has V_PK_{MIN|MAX}3_{I|U}16 instructions.
+ bool hasPkMinMax3Insts() const { return GFX1250Insts; }
+
// \returns true if target has S_SETPRIO_INC_WG instruction.
bool hasSetPrioIncWgInst() const { return HasSetPrioIncWgInst; }
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index 197bb3f..11b072e 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -157,6 +157,9 @@ void AMDGPUInstPrinter::printCPol(const MCInst *MI, unsigned OpNo,
const int64_t TH = Imm & CPol::TH;
const int64_t Scope = Imm & CPol::SCOPE;
+ if (Imm & CPol::SCAL)
+ O << " scale_offset";
+
printTH(MI, TH, Scope, O);
printScope(Scope, O);
diff --git a/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
index 429ce0e0..a33dbfa 100644
--- a/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
@@ -270,5 +270,6 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
MI.eraseFromParent();
}
}
+ finalizeBundles(MF);
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp b/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp
index 2a3b42e..eff5b0a 100644
--- a/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp
@@ -138,7 +138,6 @@ void R600PassConfig::addPreSched2() {
void R600PassConfig::addPreEmitPass() {
addPass(createR600MachineCFGStructurizerPass());
addPass(createR600ExpandSpecialInstrsPass());
- addPass(&FinalizeMachineBundlesID);
addPass(createR600Packetizer());
addPass(createR600ControlFlowFinalizer());
}
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index d379088..40b8bcd 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -392,16 +392,20 @@ enum CPol {
TH_ATOMIC_CASCADE = 4, // Cascading vs regular
// Scope
- SCOPE = 0x3 << 3, // All Scope bits
- SCOPE_CU = 0 << 3,
- SCOPE_SE = 1 << 3,
- SCOPE_DEV = 2 << 3,
- SCOPE_SYS = 3 << 3,
+ SCOPE_SHIFT = 3,
+ SCOPE_MASK = 0x3,
+ SCOPE = SCOPE_MASK << SCOPE_SHIFT, // All Scope bits
+ SCOPE_CU = 0 << SCOPE_SHIFT,
+ SCOPE_SE = 1 << SCOPE_SHIFT,
+ SCOPE_DEV = 2 << SCOPE_SHIFT,
+ SCOPE_SYS = 3 << SCOPE_SHIFT,
NV = 1 << 5, // Non-volatile bit
SWZ = 1 << 6, // Swizzle bit
+ SCAL = 1 << 11, // Scale offset bit
+
ALL = TH | SCOPE,
// Helper bits
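A short C++ check of how the reworked SCOPE encoding and the new SCAL bit compose, using only the values defined in this hunk (not a full model of the cpol operand):

#include <cassert>

int main() {
  constexpr unsigned ScopeShift = 3, ScopeMask = 0x3;
  constexpr unsigned Scope = ScopeMask << ScopeShift;
  constexpr unsigned ScopeDev = 2 << ScopeShift;
  constexpr unsigned Scal = 1u << 11;

  unsigned CPol = ScopeDev | Scal;             // device scope + scale_offset
  assert(((CPol & Scope) >> ScopeShift) == 2); // the scope field decodes to DEV
  assert((CPol & Scal) != 0);                  // scale_offset is requested
  return 0;
}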
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index e172c0b..e5d1eaa 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1209,18 +1209,24 @@ void SIFoldOperandsImpl::foldOperand(
return;
}
- // A frame index will resolve to a positive constant, so it should always be
- // safe to fold the addressing mode, even pre-GFX9.
- UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getFI());
-
const unsigned Opc = UseMI->getOpcode();
if (TII->isFLATScratch(*UseMI) &&
AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::saddr)) {
unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(Opc);
+ unsigned CPol =
+ TII->getNamedOperand(*UseMI, AMDGPU::OpName::cpol)->getImm();
+ if ((CPol & AMDGPU::CPol::SCAL) &&
+ !AMDGPU::supportsScaleOffset(*TII, NewOpc))
+ return;
+
UseMI->setDesc(TII->get(NewOpc));
}
+ // A frame index will resolve to a positive constant, so it should always be
+ // safe to fold the addressing mode, even pre-GFX9.
+ UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getFI());
+
return;
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index bc0fd8d..74fe2b8 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -874,13 +874,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom);
- if (Subtarget->hasScalarSMulU64())
+ if (Subtarget->hasVectorMulU64())
+ setOperationAction(ISD::MUL, MVT::i64, Legal);
+ else if (Subtarget->hasScalarSMulU64())
setOperationAction(ISD::MUL, MVT::i64, Custom);
if (Subtarget->hasMad64_32())
setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom);
- if (Subtarget->hasPrefetch() && Subtarget->hasSafeSmemPrefetch())
+ if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
if (Subtarget->hasIEEEMinimumMaximumInsts()) {
@@ -944,6 +946,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Legal);
}
+ if (Subtarget->hasBF16PackedInsts()) {
+ setOperationAction(
+ {ISD::FADD, ISD::FMUL, ISD::FMINNUM, ISD::FMAXNUM, ISD::FMA},
+ MVT::v2bf16, Legal);
+ }
+
if (Subtarget->hasBF16TransInsts()) {
setOperationAction({ISD::FEXP2, ISD::FLOG2, ISD::FSQRT}, MVT::bf16, Legal);
}
@@ -1053,10 +1061,12 @@ ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const {
// where this is OK to use.
bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
EVT DestVT, EVT SrcVT) const {
- return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
- (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
- DestVT.getScalarType() == MVT::f32 &&
- SrcVT.getScalarType() == MVT::f16 &&
+ return DestVT.getScalarType() == MVT::f32 &&
+ ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
+ (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
+ SrcVT.getScalarType() == MVT::f16) ||
+ (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
+ SrcVT.getScalarType() == MVT::bf16)) &&
// TODO: This probably only requires no input flushing?
denormalModeIsFlushAllF32(DAG.getMachineFunction());
}
@@ -1540,7 +1550,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
return true;
}
- case Intrinsic::amdgcn_s_prefetch_data: {
+ case Intrinsic::amdgcn_s_prefetch_data:
+ case Intrinsic::amdgcn_flat_prefetch:
+ case Intrinsic::amdgcn_global_prefetch: {
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
Info.ptrVal = CI.getArgOperand(0);
@@ -4432,19 +4444,28 @@ SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op,
}
SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
- if (Op->isDivergent())
+ if (Op->isDivergent() &&
+ (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
+ // Cannot do I$ prefetch with divergent pointer.
return SDValue();
switch (cast<MemSDNode>(Op)->getAddressSpace()) {
case AMDGPUAS::FLAT_ADDRESS:
case AMDGPUAS::GLOBAL_ADDRESS:
case AMDGPUAS::CONSTANT_ADDRESS:
- case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
break;
+ case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
+ if (Subtarget->hasSafeSmemPrefetch())
+ break;
+ [[fallthrough]];
default:
return SDValue();
}
+ // I$ prefetch
+ if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))
+ return SDValue();
+
return Op;
}
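A hedged summary of the cases above as a standalone predicate (the helper and its parameters are illustrative); it returns true when the prefetch node is kept and false when it is dropped:

static bool keepPrefetch(bool Divergent, bool IsDataPrefetch,
                         bool IsFlatGlobalOrConst, bool IsConst32,
                         bool HasVmemPrefInsts, bool HasSafeSmemPrefetch) {
  if (Divergent && (!HasVmemPrefInsts || !IsDataPrefetch))
    return false; // no I$ prefetch with a divergent pointer
  if (!IsFlatGlobalOrConst && !(IsConst32 && HasSafeSmemPrefetch))
    return false; // unsupported address space
  if (!HasSafeSmemPrefetch && !IsDataPrefetch)
    return false; // I$ prefetch requires safe SMEM prefetch
  return true;
}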
@@ -5415,6 +5436,19 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineOperand &Src0 = MI.getOperand(1);
MachineOperand &Src1 = MI.getOperand(2);
+ if (ST.hasAddSubU64Insts()) {
+ auto I = BuildMI(*BB, MI, DL,
+ TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
+ : AMDGPU::V_SUB_U64_e64),
+ Dest.getReg())
+ .add(Src0)
+ .add(Src1)
+ .addImm(0); // clamp
+ TII->legalizeOperands(*I);
+ MI.eraseFromParent();
+ return BB;
+ }
+
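A compact sketch of the selection order the custom inserter now follows for the 64-bit add/sub pseudos; the enum and helper are illustrative, and the last fallback names the pre-existing 32-bit carry-chain expansion:

enum class U64ArithLowering { AddSubU64, LshlAddU64, CarryChain32 };

static U64ArithLowering pickU64ArithLowering(bool HasAddSubU64Insts,
                                             bool HasLshlAddU64Inst, bool IsAdd) {
  if (HasAddSubU64Insts)
    return U64ArithLowering::AddSubU64;  // single v_add_u64/v_sub_u64
  if (IsAdd && HasLshlAddU64Inst)
    return U64ArithLowering::LshlAddU64; // v_lshl_add_u64 with a zero shift
  return U64ArithLowering::CarryChain32; // 32-bit add/addc style expansion
}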
if (IsAdd && ST.hasLshlAddU64Inst()) {
auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
Dest.getReg())
@@ -13633,6 +13667,7 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
case Intrinsic::amdgcn_rcp_legacy:
case Intrinsic::amdgcn_rsq_legacy:
case Intrinsic::amdgcn_trig_preop:
+ case Intrinsic::amdgcn_tanh:
case Intrinsic::amdgcn_log:
case Intrinsic::amdgcn_exp2:
case Intrinsic::amdgcn_sqrt:
@@ -14046,7 +14081,8 @@ static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
case ISD::FMAXIMUMNUM:
case AMDGPUISD::FMIN_LEGACY:
case AMDGPUISD::FMAX_LEGACY:
- return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16());
+ return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
+ (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
case ISD::FMINIMUM:
case ISD::FMAXIMUM:
return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 9faf497..dd3f2fe 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -2108,8 +2108,9 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
assert(TII->isFLAT(MI));
- // All flat instructions use the VMEM counter.
- assert(TII->usesVM_CNT(MI));
+ // All flat instructions use the VMEM counter except prefetch.
+ if (!TII->usesVM_CNT(MI))
+ return false;
// If there are no memory operands then conservatively assume the flat
// operation may access VMEM.
@@ -2295,9 +2296,6 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
}
- // A Flat memory operation must access at least one address space.
- assert(FlatASCount);
-
// This is a flat memory operation that access both VMEM and LDS, so note it
// - it will require that both the VM and LGKM be flushed to zero if it is
// pending when a VM or LGKM dependency occurs.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
index 6b41934..89d9b0d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -318,6 +318,7 @@ def CPolBit {
int DLC = 2;
int SCC = 4;
int NV = 5;
+ int SCAL = 11;
}
class VOPDstOperand <RegisterClass rc> : RegisterOperand <rc, "printVOPDst">;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index e2a2525..40e6871 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -5482,6 +5482,19 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
+ if (const MachineOperand *CPol = getNamedOperand(MI, AMDGPU::OpName::cpol)) {
+ if (CPol->getImm() & AMDGPU::CPol::SCAL) {
+ if (!ST.hasScaleOffset()) {
+ ErrInfo = "Subtarget does not support offset scaling";
+ return false;
+ }
+ if (!AMDGPU::supportsScaleOffset(*this, MI.getOpcode())) {
+ ErrInfo = "Instruction does not support offset scaling";
+ return false;
+ }
+ }
+ }
+
return true;
}
@@ -7348,6 +7361,10 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
}
case AMDGPU::S_MUL_U64:
+ if (ST.hasVectorMulU64()) {
+ NewOpcode = AMDGPU::V_MUL_U64_e64;
+ break;
+ }
// Split s_mul_u64 in 32-bit vector multiplications.
splitScalarSMulU64(Worklist, Inst, MDT);
Inst.eraseFromParent();
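For context, splitScalarSMulU64 relies on the standard 32-bit decomposition of a 64x64->64 multiply, which v_mul_u64 now makes unnecessary; this is only the arithmetic identity, not the actual MIR expansion:

#include <cstdint>

static uint64_t mulU64Via32(uint64_t A, uint64_t B) {
  uint32_t ALo = uint32_t(A), AHi = uint32_t(A >> 32);
  uint32_t BLo = uint32_t(B), BHi = uint32_t(B >> 32);
  uint64_t LoProd = uint64_t(ALo) * BLo;    // full 64-bit low product
  uint32_t HiTerms = AHi * BLo + ALo * BHi; // only the low 32 bits contribute
  return LoProd + (uint64_t(HiTerms) << 32);
}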
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index bd4995b..b0be3f86 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1662,6 +1662,8 @@ def VOP3OpSelMods : ComplexPattern<untyped, 2, "SelectVOP3OpSelMods">;
def VOP3PMadMixModsExt : ComplexPattern<untyped, 2, "SelectVOP3PMadMixModsExt">;
def VOP3PMadMixMods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixMods">;
+def VOP3PMadMixBF16ModsExt : ComplexPattern<untyped, 2, "SelectVOP3PMadMixBF16ModsExt">;
+def VOP3PMadMixBF16Mods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixBF16Mods">;
def VINTERPMods : ComplexPattern<untyped, 2, "SelectVINTERPMods">;
def VINTERPModsHi : ComplexPattern<untyped, 2, "SelectVINTERPModsHi">;
@@ -2866,6 +2868,7 @@ def VOP_I16_I16_I16_ARITH : VOPProfile <[i16, i16, i16, untyped], /*EnableClamp=
def VOP_I16_I16_I16_I16 : VOPProfile <[i16, i16, i16, i16, untyped]>;
def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>;
+def VOP_BF16_BF16_BF16_BF16 : VOPProfile <[bf16, bf16, bf16, bf16, untyped]>;
def VOP_I32_I16_I16_I32 : VOPProfile <[i32, i16, i16, i32, untyped]>;
def VOP_I32_I16 : VOPProfile <[i32, i16, untyped, untyped]>;
@@ -2873,10 +2876,12 @@ def VOP_I16_I32 : VOPProfile <[i16, i32, untyped, untyped]>;
def VOP_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, untyped]>;
def VOP_V2I16_V2I16_V2I16 : VOPProfile <[v2i16, v2i16, v2i16, untyped]>;
+def VOP_V2BF16_V2BF16_V2BF16 : VOPProfile <[v2bf16, v2bf16, v2bf16, untyped]>;
def VOP_B32_F16_F16 : VOPProfile <[i32, f16, f16, untyped]>;
def VOP_V2F16_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, v2f16]>;
def VOP_V2I16_V2I16_V2I16_V2I16 : VOPProfile <[v2i16, v2i16, v2i16, v2i16]>;
+def VOP_V2BF16_V2BF16_V2BF16_V2BF16 : VOPProfile <[v2bf16, v2bf16, v2bf16, v2bf16]>;
def VOP_V2I16_F32_F32 : VOPProfile <[v2i16, f32, f32, untyped]>;
def VOP_V2I16_I32_I32 : VOPProfile <[v2i16, i32, i32, untyped]>;
@@ -2912,8 +2917,10 @@ def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>;
def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>;
def VOP_I16_F32_F32 : VOPProfile <[i16, f32, f32, untyped]>;
def VOP_I32_I32_I32_ARITH : VOPProfile <[i32, i32, i32, untyped], /*EnableClamp=*/1>;
+def VOP_I64_I64_I64_ARITH : VOPProfile <[i64, i64, i64, untyped], /*EnableClamp=*/1>;
def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>;
def VOP_F32_F16_F16_F16 : VOPProfile <[f32, f16, f16, f16]>;
+def VOP_F32_BF16_BF16_BF16 : VOPProfile <[f32, bf16, bf16, bf16]>;
def VOP_V2BF16_F32_F32 : VOPProfile <[v2bf16, f32, f32, untyped]>;
def VOP_V32F32_V6I32_F32 : VOPProfile <[v32f32, v6i32, f32, untyped]>;
def VOP_V32F16_V6I32_F32 : VOPProfile <[v32f16, v6i32, f32, untyped]>;
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 5097ac03..b49c5a9 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -61,6 +61,7 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIDefines.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"
@@ -1078,7 +1079,9 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
if (EltOffset0 + CI.Width != EltOffset1 &&
EltOffset1 + Paired.Width != EltOffset0)
return false;
- if (CI.CPol != Paired.CPol)
+    // Instructions with the scale_offset modifier cannot be combined unless we
+    // also generate code to scale the offset and reset that bit.
+ if (CI.CPol != Paired.CPol || (CI.CPol & AMDGPU::CPol::SCAL))
return false;
if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 3212060..0e8a420 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -704,16 +704,16 @@ void diagnoseUnknownMMRAASName(const MachineInstr &MI, StringRef AS) {
DiagnosticInfoUnsupported(Fn, Str.str(), MI.getDebugLoc(), DS_Warning));
}
-/// Reads \p MI's MMRAs to parse the "amdgpu-as" MMRA.
-/// If this tag isn't present, or if it has no meaningful values, returns \p
-/// Default. Otherwise returns all the address spaces concerned by the MMRA.
-static SIAtomicAddrSpace getFenceAddrSpaceMMRA(const MachineInstr &MI,
- SIAtomicAddrSpace Default) {
- static constexpr StringLiteral FenceASPrefix = "amdgpu-as";
+/// Reads \p MI's MMRAs to parse the "amdgpu-synchronize-as" MMRA.
+/// If this tag isn't present, or if it has no meaningful values, returns
+/// \p none; otherwise returns the address spaces specified by the MD.
+static std::optional<SIAtomicAddrSpace>
+getSynchronizeAddrSpaceMD(const MachineInstr &MI) {
+ static constexpr StringLiteral FenceASPrefix = "amdgpu-synchronize-as";
auto MMRA = MMRAMetadata(MI.getMMRAMetadata());
if (!MMRA)
- return Default;
+ return std::nullopt;
SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE;
for (const auto &[Prefix, Suffix] : MMRA) {
@@ -726,7 +726,10 @@ static SIAtomicAddrSpace getFenceAddrSpaceMMRA(const MachineInstr &MI,
diagnoseUnknownMMRAASName(MI, Suffix);
}
- return (Result != SIAtomicAddrSpace::NONE) ? Result : Default;
+ if (Result == SIAtomicAddrSpace::NONE)
+ return std::nullopt;
+
+ return Result;
}
} // end anonymous namespace
@@ -903,12 +906,19 @@ SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
*ScopeOrNone;
- if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
- ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
+ if (OrderingAddrSpace != SIAtomicAddrSpace::ATOMIC) {
+ // We currently expect refineOrderingAS to be the only place that
+ // can refine the AS ordered by the fence.
+ // If that changes, we need to review the semantics of that function
+ // in case it needs to preserve certain address spaces.
reportUnsupported(MI, "Unsupported atomic address space");
return std::nullopt;
}
+ auto SynchronizeAS = getSynchronizeAddrSpaceMD(*MI);
+ if (SynchronizeAS)
+ OrderingAddrSpace = *SynchronizeAS;
+
return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
}
@@ -2687,11 +2697,7 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
AtomicPseudoMIs.push_back(MI);
bool Changed = false;
- // Refine fenced address space based on MMRAs.
- //
- // TODO: Should we support this MMRA on other atomic operations?
- auto OrderingAddrSpace =
- getFenceAddrSpaceMMRA(*MI, MOI.getOrderingAddrSpace());
+ const SIAtomicAddrSpace OrderingAddrSpace = MOI.getOrderingAddrSpace();
if (MOI.isAtomic()) {
const AtomicOrdering Order = MOI.getOrdering();
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index d8b52d2..4bda51d 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -856,16 +856,18 @@ def smrd_sextloadi16 : SMRDLoadPat<sextloadi16>;
def smrd_prefetch : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type),
(prefetch node:$ptr, node:$rw, node:$loc, node:$type),
- [{ return !N->getOperand(1)->isDivergent();}]> {
+ [{ return !N->getOperand(1)->isDivergent() && Subtarget->hasSafeSmemPrefetch();}]> {
let GISelPredicateCode = [{
- return isInstrUniform(MI);
+ return isInstrUniform(MI) && Subtarget->hasSafeSmemPrefetch();
}];
}
def SMRDImm : ComplexPattern<iPTR, 2, "SelectSMRDImm">;
def SMRDImm32 : ComplexPattern<iPTR, 2, "SelectSMRDImm32">;
-def SMRDSgpr : ComplexPattern<iPTR, 2, "SelectSMRDSgpr">;
-def SMRDSgprImm : ComplexPattern<iPTR, 3, "SelectSMRDSgprImm">;
+let WantsRoot = true in {
+ def SMRDSgpr : ComplexPattern<iPTR, 3, "SelectSMRDSgpr", [], [], -3>;
+ def SMRDSgprImm : ComplexPattern<iPTR, 4, "SelectSMRDSgprImm", [], []>;
+}
def SMRDBufferImm : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm">;
def SMRDBufferImm32 : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm32">;
def SMRDBufferSgprImm : ComplexPattern<iPTR, 2, "SelectSMRDBufferSgprImm">;
@@ -906,15 +908,15 @@ multiclass SMRD_Patterns <string Instr, ValueType vt, PatFrag frag,
let SubtargetPredicate = isNotGFX9Plus;
}
def : GCNPat <
- (frag (SMRDSgpr i64:$sbase, i32:$soffset)),
- (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, 0, 0))> {
+ (frag (SMRDSgpr i64:$sbase, i32:$soffset, CPol:$cpol)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, 0, $cpol))> {
let SubtargetPredicate = isGFX9Plus;
}
// 4. SGPR+IMM offset
def : GCNPat <
- (frag (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)),
- (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, $offset, 0))> {
+ (frag (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset, CPol:$cpol)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, $offset, $cpol))> {
let SubtargetPredicate = isGFX9Plus;
}
@@ -989,15 +991,15 @@ multiclass ScalarLoadWithExtensionPat <string Instr, SDPatternOperator node, Val
// 2. SGPR offset
def : GCNPat <
- (node (SMRDSgpr i64:$sbase, i32:$soffset)),
- (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, 0, 0))>{
+ (node (SMRDSgpr i64:$sbase, i32:$soffset, CPol:$cpol)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, 0, $cpol))>{
let SubtargetPredicate = isGFX12Plus;
}
// 3. SGPR+IMM offset
def : GCNPat <
- (node (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)),
- (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, $offset, 0))>{
+ (node (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset, CPol:$cpol)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, $offset, $cpol))>{
let SubtargetPredicate = isGFX12Plus;
}
@@ -1150,6 +1152,7 @@ multiclass SMPrefetchPat<string type, TImmLeaf cache_type> {
}
defm : SMPrefetchPat<"INST", i32imm_zero>;
+let AddedComplexity = 12 in // Prefer scalar prefetch over global for r/o case.
defm : SMPrefetchPat<"DATA", i32imm_one>;
let SubtargetPredicate = isGFX12Plus in {
@@ -1488,6 +1491,7 @@ class SMEM_Real_Load_gfx12<bits<6> op, string ps, string opName, OffsetMode offs
let Inst{20} = cpol{CPolBit.NV}; // non-volatile
let Inst{22-21} = cpol{4-3}; // scope
let Inst{24-23} = cpol{1-0}; // th - only lower 2 bits are supported
+ let Inst{56} = cpol{CPolBit.SCAL}; // scale offset
}
multiclass SM_Real_Loads_gfx12<bits<6> op, string ps = NAME> {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 9c6c374..b5b3cc9 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -3228,6 +3228,25 @@ const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t Format,
: getGfx9BufferFormatInfo(Format);
}
+bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode) {
+ uint64_t TSFlags = MII.get(Opcode).TSFlags;
+
+ if (TSFlags & SIInstrFlags::SMRD)
+ return !getSMEMIsBuffer(Opcode);
+ if (!(TSFlags & SIInstrFlags::FLAT))
+ return false;
+
+ // Only SV and SVS modes are supported.
+ if (TSFlags & SIInstrFlags::FlatScratch)
+ return hasNamedOperand(Opcode, OpName::vaddr);
+
+ // Only GVS mode is supported.
+ return hasNamedOperand(Opcode, OpName::vaddr) &&
+ hasNamedOperand(Opcode, OpName::saddr);
+}
+
bool hasAny64BitVGPROperands(const MCInstrDesc &OpDesc) {
for (auto OpName : {OpName::vdst, OpName::src0, OpName::src1, OpName::src2}) {
int Idx = getNamedOperandIdx(OpDesc.getOpcode(), OpName);
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index bde951b..c09a9d6 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1757,6 +1757,9 @@ bool isIntrinsicSourceOfDivergence(unsigned IntrID);
/// \returns true if the intrinsic is uniform
bool isIntrinsicAlwaysUniform(unsigned IntrID);
+/// \returns true if a memory instruction supports scale_offset modifier.
+bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode);
+
/// \returns lds block size in terms of dwords. \p
/// This is used to calculate the lds size encoded for PAL metadata 3.0+ which
/// must be defined in terms of bytes.
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 030a6e1..550ec9d 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -925,6 +925,17 @@ let isAdd = 1 in {
defm V_ADDC_U32 : VOP2bInst <"v_addc_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "v_addc_u32">;
}
+let isReMaterializable = 1 in {
+let SubtargetPredicate = HasAddSubU64Insts, SchedRW = [Write64Bit] in {
+defm V_ADD_U64 : VOP2Inst <"v_add_nc_u64", VOP_I64_I64_I64_ARITH>;
+// We don't actually have something like V_SUBREV_U64, so V_SUB_U64 can't be treated as commutable.
+let isCommutable = 0 in
+defm V_SUB_U64 : VOP2Inst <"v_sub_nc_u64", VOP_I64_I64_I64_ARITH>;
+} // End SubtargetPredicate = HasAddSubU64Insts, SchedRW = [Write64Bit]
+let SubtargetPredicate = isGFX1250Plus, SchedRW = [WriteDouble] in
+defm V_MUL_U64 : VOP2Inst <"v_mul_u64", VOP_I64_I64_I64, DivergentBinFrag<mul>>;
+} // End isReMaterializable = 1
+
} // End isCommutable = 1
// These are special and do not read the exec mask.
@@ -1754,6 +1765,9 @@ multiclass VOP2_Real_FULL_with_name<GFXGen Gen, bits<6> op, string opName,
VOP2_Realtriple_e64_with_name<Gen, op, opName, asmName>,
VOP2_Real_NO_VOP3_with_name<Gen, op, opName, asmName>;
+multiclass VOP2_Real_NO_DPP<GFXGen Gen, bits<6> op> :
+ VOP2_Real_e32<Gen, op>, VOP2_Real_e64<Gen, op>;
+
multiclass VOP2_Real_NO_DPP_with_name<GFXGen Gen, bits<6> op, string opName,
string asmName> {
defm NAME : VOP2_Real_e32_with_name<Gen, op, opName, asmName>,
@@ -1843,6 +1857,9 @@ defm V_FMAC_F64 : VOP2_Real_FULL<GFX12Gen, 0x17>;
defm V_FMAMK_F64 : VOP2Only_Real_MADK64<GFX1250Gen, 0x23>;
defm V_FMAAK_F64 : VOP2Only_Real_MADK64<GFX1250Gen, 0x24>;
+defm V_ADD_U64 : VOP2_Real_FULL<GFX1250Gen, 0x28>;
+defm V_SUB_U64 : VOP2_Real_FULL<GFX1250Gen, 0x29>;
+defm V_MUL_U64 : VOP2_Real_NO_DPP<GFX1250Gen, 0x2a>;
//===----------------------------------------------------------------------===//
// GFX11.
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index aee2f2c..b6f9568 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1918,6 +1918,7 @@ let AssemblerPredicate = isGFX11Plus in {
// These instructions differ from GFX12 variant by supporting DPP:
defm V_LSHL_ADD_U64 : VOP3Only_Realtriple_gfx1250<0x252>;
+defm V_CVT_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250<0x36d>;
//===----------------------------------------------------------------------===//
// GFX10.
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 9feea36..c812dc9 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -35,14 +35,18 @@ class VOP3P_Mix_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR,
bit useTiedOutput = 0> : VOP3P_Profile<P, Features, 1> {
bit UseTiedOutput = useTiedOutput;
+ defvar Src0RC = getVCSrcForVT<P.Src0VT>.ret;
+ defvar Src1RC = getVCSrcForVT<P.Src1VT>.ret;
+ defvar Src2RC = getVCSrcForVT<P.Src2VT>.ret;
+
dag srcs =
- (ins FP16InputMods:$src0_modifiers, VCSrc_f16:$src0,
- FP16InputMods:$src1_modifiers, VCSrc_f16:$src1,
- FP16InputMods:$src2_modifiers, VCSrc_f16:$src2);
+ (ins FP16InputMods:$src0_modifiers, Src0RC:$src0,
+ FP16InputMods:$src1_modifiers, Src1RC:$src1,
+ FP16InputMods:$src2_modifiers, Src2RC:$src2);
dag dpp_srcs =
(ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0,
FPVRegInputMods:$src1_modifiers, VRegSrc_32:$src1,
- FP16InputMods:$src2_modifiers, VCSrc_f16:$src2);
+ FP16InputMods:$src2_modifiers, Src2RC:$src2);
// FIXME: Clamp0 misbehaves with the non-default vdst_in
// following it. For now, work around this by requiring clamp
@@ -144,48 +148,59 @@ def : VOP3PSatPat<usubsat, V_PK_SUB_U16>;
def : VOP3PSatPat<ssubsat, V_PK_SUB_I16>;
} // End SubtargetPredicate = HasVOP3PInsts
-let SubtargetPredicate = HasMinimum3Maximum3PKF16, FPDPRounding = 1 in {
+let isCommutable = 1, FPDPRounding = 1 in {
+let SubtargetPredicate = HasMin3Max3PKF16 in {
+defm V_PK_MIN3_NUM_F16 : VOP3PInst<"v_pk_min3_num_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, AMDGPUfmin3>;
+defm V_PK_MAX3_NUM_F16 : VOP3PInst<"v_pk_max3_num_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, AMDGPUfmax3>;
+}
+
+let SubtargetPredicate = HasMinimum3Maximum3PKF16 in {
defm V_PK_MINIMUM3_F16 : VOP3PInst<"v_pk_minimum3_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, AMDGPUfminimum3>;
defm V_PK_MAXIMUM3_F16 : VOP3PInst<"v_pk_maximum3_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, AMDGPUfmaximum3>;
}
+} // End isCommutable = 1, FPDPRounding = 1
// TODO: Make sure we're doing the right thing with denormals. Note
// that FMA and MAD will differ.
multiclass MadFmaMixPats<SDPatternOperator fma_like,
Instruction mix_inst,
Instruction mixlo_inst,
- Instruction mixhi_inst> {
+ Instruction mixhi_inst,
+ ValueType VT = f16,
+ ValueType vecVT = v2f16> {
+ defvar VOP3PMadMixModsPat = !if (!eq(VT, bf16), VOP3PMadMixBF16Mods, VOP3PMadMixMods);
+ defvar VOP3PMadMixModsExtPat = !if (!eq(VT, bf16), VOP3PMadMixBF16ModsExt, VOP3PMadMixModsExt);
// At least one of the operands needs to be an fpextend of an f16
// for this to be worthwhile, so we need three patterns here.
// TODO: Could we use a predicate to inspect src1/2/3 instead?
def : GCNPat <
- (f32 (fma_like (f32 (VOP3PMadMixModsExt f16:$src0, i32:$src0_mods)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_mods)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_mods)))),
+ (f32 (fma_like (f32 (VOP3PMadMixModsExtPat VT:$src0, i32:$src0_mods)),
+ (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_mods)),
+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_mods)))),
(mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
DSTCLAMP.NONE)>;
def : GCNPat <
- (f32 (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_mods)),
- (f32 (VOP3PMadMixModsExt f16:$src1, i32:$src1_mods)),
- (f32 (VOP3PMadMixMods f32:$src2, i32:$src2_mods)))),
+ (f32 (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_mods)),
+ (f32 (VOP3PMadMixModsExtPat VT:$src1, i32:$src1_mods)),
+ (f32 (VOP3PMadMixModsPat f32:$src2, i32:$src2_mods)))),
(mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
DSTCLAMP.NONE)>;
def : GCNPat <
- (f32 (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_mods)),
- (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_mods)),
- (f32 (VOP3PMadMixModsExt f16:$src2, i32:$src2_mods)))),
+ (f32 (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_mods)),
+ (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_mods)),
+ (f32 (VOP3PMadMixModsExtPat VT:$src2, i32:$src2_mods)))),
(mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
DSTCLAMP.NONE)>;
def : GCNPat <
(AMDGPUclamp (build_vector
- (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers))))),
- (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers))))))),
- (v2f16 (mixhi_inst $hi_src0_modifiers, $hi_src0,
+ (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$lo_src0, i32:$lo_src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$lo_src1, i32:$lo_src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$lo_src2, i32:$lo_src2_modifiers))))),
+ (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$hi_src0, i32:$hi_src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$hi_src1, i32:$hi_src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$hi_src2, i32:$hi_src2_modifiers))))))),
+ (vecVT (mixhi_inst $hi_src0_modifiers, $hi_src0,
$hi_src1_modifiers, $hi_src1,
$hi_src2_modifiers, $hi_src2,
DSTCLAMP.ENABLE,
@@ -197,8 +212,8 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
>;
def : GCNPat <
- (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers))))),
+ (VT (fpround (fmul (f32 (VOP3PMadMixModsPat f32:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_modifiers))))),
(mixlo_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
(i32 0), (i32 0),
@@ -207,9 +222,9 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
>;
def : GCNPat <
- (build_vector f16:$elt0, (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers)))))),
- (v2f16 (mixhi_inst $src0_modifiers, $src0,
+ (build_vector VT:$elt0, (VT (fpround (fmul (f32 (VOP3PMadMixModsPat f32:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_modifiers)))))),
+ (vecVT (mixhi_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
(i32 0), (i32 0),
DSTCLAMP.NONE,
@@ -217,9 +232,9 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
>;
def : GCNPat <
- (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
+ (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))),
(mixlo_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
$src2_modifiers, $src2,
@@ -234,10 +249,10 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
let True16Predicate = p in {
def : GCNPat <
- (build_vector f16:$elt0, (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
- (v2f16 (mixhi_inst $src0_modifiers, $src0,
+ (build_vector VT:$elt0, (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers)))))),
+ (vecVT (mixhi_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
$src2_modifiers, $src2,
DSTCLAMP.NONE,
@@ -246,11 +261,11 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
def : GCNPat <
(build_vector
- f16:$elt0,
- (AMDGPUclamp (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))))),
- (v2f16 (mixhi_inst $src0_modifiers, $src0,
+ VT:$elt0,
+ (AMDGPUclamp (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))))),
+ (vecVT (mixhi_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
$src2_modifiers, $src2,
DSTCLAMP.ENABLE,
@@ -261,38 +276,38 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
let True16Predicate = UseRealTrue16Insts in {
def : GCNPat <
- (build_vector (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))), f16:$elt1),
- (v2f16 (mixlo_inst $src0_modifiers, $src0,
+ (build_vector (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))), VT:$elt1),
+ (vecVT (mixlo_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
$src2_modifiers, $src2,
DSTCLAMP.NONE,
- (REG_SEQUENCE VGPR_32, (f16 (IMPLICIT_DEF)), lo16, $elt1, hi16)))
+ (REG_SEQUENCE VGPR_32, (VT (IMPLICIT_DEF)), lo16, $elt1, hi16)))
>;
def : GCNPat <
- (build_vector f16:$elt0, (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
- (v2f16 (mixhi_inst $src0_modifiers, $src0,
+ (build_vector VT:$elt0, (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers)))))),
+ (vecVT (mixhi_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
$src2_modifiers, $src2,
DSTCLAMP.NONE,
- (REG_SEQUENCE VGPR_32, $elt0, lo16, (f16 (IMPLICIT_DEF)), hi16)))
+ (REG_SEQUENCE VGPR_32, $elt0, lo16, (VT (IMPLICIT_DEF)), hi16)))
>;
def : GCNPat <
(build_vector
- f16:$elt0,
- (AMDGPUclamp (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))))),
- (v2f16 (mixhi_inst $src0_modifiers, $src0,
+ VT:$elt0,
+ (AMDGPUclamp (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))))),
+ (vecVT (mixhi_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
$src2_modifiers, $src2,
DSTCLAMP.ENABLE,
- (REG_SEQUENCE VGPR_32, $elt0, lo16, (f16 (IMPLICIT_DEF)), hi16)))
+ (REG_SEQUENCE VGPR_32, $elt0, lo16, (VT (IMPLICIT_DEF)), hi16)))
>;
} // end True16Predicate
}
@@ -353,6 +368,67 @@ defm V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3P_Mix_Profile<VOP_F
defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
}
+let SubtargetPredicate = HasFmaMixBF16Insts in {
+let isCommutable = 1 in {
+
+let isReMaterializable = 1 in
+defm V_FMA_MIX_F32_BF16 : VOP3_VOP3PInst<"v_fma_mix_f32_bf16", VOP3P_Mix_Profile<VOP_F32_BF16_BF16_BF16, VOP3_OPSEL>>;
+
+let FPDPRounding = 1 in {
+defm V_FMA_MIXLO_BF16 : VOP3_VOP3PInst<"v_fma_mixlo_bf16", VOP3P_Mix_Profile<VOP_BF16_BF16_BF16_BF16, VOP3_OPSEL, 1>>;
+
+let ClampLo = 0, ClampHi = 1 in {
+defm V_FMA_MIXHI_BF16 : VOP3_VOP3PInst<"v_fma_mixhi_bf16", VOP3P_Mix_Profile<VOP_BF16_BF16_BF16_BF16, VOP3_OPSEL, 1>>;
+}
+} // End FPDPRounding = 1
+} // End isCommutable = 1
+
+defm : MadFmaMixPats<fma, V_FMA_MIX_F32_BF16, V_FMA_MIXLO_BF16, V_FMA_MIXHI_BF16, bf16, v2bf16>;
+} // End SubtargetPredicate = HasFmaMixBF16Insts
+
+def PK_ADD_MINMAX_Profile : VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16, VOP3_PACKED> {
+ let HasModifiers = 0;
+}
+
+let isCommutable = 1, isReMaterializable = 1 in {
+let SubtargetPredicate = HasPkAddMinMaxInsts in {
+defm V_PK_ADD_MAX_I16 : VOP3PInst<"v_pk_add_max_i16", PK_ADD_MINMAX_Profile>;
+defm V_PK_ADD_MAX_U16 : VOP3PInst<"v_pk_add_max_u16", PK_ADD_MINMAX_Profile>;
+defm V_PK_ADD_MIN_I16 : VOP3PInst<"v_pk_add_min_i16", PK_ADD_MINMAX_Profile>;
+defm V_PK_ADD_MIN_U16 : VOP3PInst<"v_pk_add_min_u16", PK_ADD_MINMAX_Profile>;
+}
+let SubtargetPredicate = HasPkMinMax3Insts in {
+defm V_PK_MAX3_I16 : VOP3PInst<"v_pk_max3_i16", PK_ADD_MINMAX_Profile>;
+defm V_PK_MAX3_U16 : VOP3PInst<"v_pk_max3_u16", PK_ADD_MINMAX_Profile>;
+defm V_PK_MIN3_I16 : VOP3PInst<"v_pk_min3_i16", PK_ADD_MINMAX_Profile>;
+defm V_PK_MIN3_U16 : VOP3PInst<"v_pk_min3_u16", PK_ADD_MINMAX_Profile>;
+}
+} // End isCommutable = 1, isReMaterializable = 1
+
+// TODO: Extend pattern to select op_sel and op_sel_hi.
+class ThreeOp_OpSelClampPats <SDPatternOperator op1, SDPatternOperator op2,
+ VOP3P_Pseudo inst,
+ ValueType vt = inst.Pfl.Src0VT,
+ RegisterOperand RC = getVCSrcForVT<vt>.ret> : GCNPat <
+ (ThreeOpFrag<op1, op2> vt:$src0, vt:$src1, vt:$src2),
+ (inst SRCMODS.OP_SEL_1, RC:$src0, SRCMODS.OP_SEL_1, RC:$src1,
+ SRCMODS.OP_SEL_1, RC:$src2, DSTCLAMP.NONE, 0)
+>;
+
+let SubtargetPredicate = HasPkAddMinMaxInsts in {
+def : ThreeOp_OpSelClampPats<add, smax, V_PK_ADD_MAX_I16>;
+def : ThreeOp_OpSelClampPats<add, umax, V_PK_ADD_MAX_U16>;
+def : ThreeOp_OpSelClampPats<add, smin, V_PK_ADD_MIN_I16>;
+def : ThreeOp_OpSelClampPats<add, umin, V_PK_ADD_MIN_U16>;
+}
+
+let SubtargetPredicate = HasPkMinMax3Insts in {
+def : ThreeOp_OpSelClampPats<smax, smax, V_PK_MAX3_I16>;
+def : ThreeOp_OpSelClampPats<umax, umax, V_PK_MAX3_U16>;
+def : ThreeOp_OpSelClampPats<smin, smin, V_PK_MIN3_I16>;
+def : ThreeOp_OpSelClampPats<umin, umin, V_PK_MIN3_U16>;
+}
+
// Defines patterns that extract signed 4bit from each Idx[0].
foreach Idx = [[0,28],[4,24],[8,20],[12,16],[16,12],[20,8],[24,4]] in
def ExtractSigned4bit_#Idx[0] : PatFrag<(ops node:$src),
@@ -1153,6 +1229,14 @@ let isCommutable = 1, isReMaterializable = 1 in {
let SubtargetPredicate = HasPkMovB32, isAsCheapAsAMove = 1 in
defm V_PK_MOV_B32 : VOP3PInst<"v_pk_mov_b32", VOP3P_Profile<VOP_V2I32_V2I32_V2I32, VOP3_PACKED>>;
+
+ let SubtargetPredicate = HasBF16PackedInsts in {
+ defm V_PK_ADD_BF16 : VOP3PInst<"v_pk_add_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, any_fadd>;
+ defm V_PK_MUL_BF16 : VOP3PInst<"v_pk_mul_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, any_fmul>;
+ defm V_PK_MIN_NUM_BF16 : VOP3PInst<"v_pk_min_num_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, fminnum_like>;
+ defm V_PK_MAX_NUM_BF16 : VOP3PInst<"v_pk_max_num_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, fmaxnum_like>;
+ defm V_PK_FMA_BF16 : VOP3PInst<"v_pk_fma_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, any_fma>;
+ }
} // End isCommutable = 1, isReMaterializable = 1
def : AMDGPUMnemonicAlias<"v_accvgpr_read", "v_accvgpr_read_b32">;
@@ -2157,6 +2241,8 @@ multiclass VOP3P_Realtriple_gfx11_gfx12<bits<8> op>
multiclass VOP3P_Real_gfx12<bits<8> op> : VOP3P_Real_Base<GFX12Gen, op>;
+multiclass VOP3P_Real_gfx1250<bits<8> op> : VOP3P_Real_Base<GFX1250Gen, op>;
+
multiclass VOP3P_Real_with_name_gfx12<bits<8> op,
string backing_ps_name = NAME,
string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> :
@@ -2165,6 +2251,35 @@ multiclass VOP3P_Real_with_name_gfx12<bits<8> op,
defm V_PK_MIN_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1b, "V_PK_MIN_F16", "v_pk_min_num_f16">;
defm V_PK_MAX_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1c, "V_PK_MAX_F16", "v_pk_max_num_f16">;
+defm V_PK_FMA_F32 : VOP3P_Real_gfx12<0x1f>;
+defm V_PK_MUL_F32 : VOP3P_Real_gfx12<0x28>;
+defm V_PK_ADD_F32 : VOP3P_Real_gfx12<0x29>;
+
+defm V_PK_ADD_MAX_I16 : VOP3P_Real_gfx1250<0x14>;
+defm V_PK_ADD_MAX_U16 : VOP3P_Real_gfx1250<0x15>;
+defm V_PK_ADD_MIN_I16 : VOP3P_Real_gfx1250<0x2d>;
+defm V_PK_ADD_MIN_U16 : VOP3P_Real_gfx1250<0x2e>;
+defm V_PK_MAX3_I16 : VOP3P_Real_gfx1250<0x2f>;
+defm V_PK_MAX3_U16 : VOP3P_Real_gfx1250<0x30>;
+defm V_PK_MIN3_I16 : VOP3P_Real_gfx1250<0x31>;
+defm V_PK_MIN3_U16 : VOP3P_Real_gfx1250<0x32>;
+defm V_PK_FMA_BF16 : VOP3P_Real_gfx1250<0x11>;
+defm V_PK_ADD_BF16 : VOP3P_Real_gfx1250<0x23>;
+defm V_PK_MUL_BF16 : VOP3P_Real_gfx1250<0x2a>;
+defm V_PK_MIN_NUM_BF16 : VOP3P_Real_gfx1250<0x2b>;
+defm V_PK_MAX_NUM_BF16 : VOP3P_Real_gfx1250<0x2c>;
+defm V_PK_MINIMUM3_F16 : VOP3P_Real_gfx1250<0x36>;
+defm V_PK_MAXIMUM3_F16 : VOP3P_Real_gfx1250<0x37>;
+defm V_PK_MIN3_NUM_F16 : VOP3P_Real_gfx1250<0x38>;
+defm V_PK_MAX3_NUM_F16 : VOP3P_Real_gfx1250<0x39>;
+
+defm V_FMA_MIX_F32_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3d>;
+defm V_FMA_MIXLO_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3e>;
+defm V_FMA_MIXHI_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3f>;
+
+let AssemblerPredicate = isGFX1250Plus in
+def : AMDGPUMnemonicAlias<"v_fma_mix_f32_f16", "v_fma_mix_f32">;
+
defm V_PK_MINIMUM_F16 : VOP3P_Real_gfx12<0x1d>;
defm V_PK_MAXIMUM_F16 : VOP3P_Real_gfx12<0x1e>;
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index fd3b052..fca5dff 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -20347,6 +20347,13 @@ ARMTargetLowering::getSingleConstraintMatchWeight(
return weight;
}
+static bool isIncompatibleReg(const MCPhysReg &PR, MVT VT) {
+ if (PR == 0 || VT == MVT::Other)
+ return false;
+ return (ARM::SPRRegClass.contains(PR) && VT != MVT::f32 && VT != MVT::i32) ||
+ (ARM::DPRRegClass.contains(PR) && VT != MVT::f64);
+}
+
using RCPair = std::pair<unsigned, const TargetRegisterClass *>;
RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
@@ -20420,7 +20427,10 @@ RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
if (StringRef("{cc}").equals_insensitive(Constraint))
return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
- return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+ auto RCP = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+ if (isIncompatibleReg(RCP.first, VT))
+ return {0, nullptr};
+ return RCP;
}
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
@@ -21731,11 +21741,16 @@ bool ARMTargetLowering::lowerInterleavedLoad(
/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
-bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
+bool ARMTargetLowering::lowerInterleavedStore(Instruction *Store,
+ Value *LaneMask,
ShuffleVectorInst *SVI,
unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
+ auto *SI = dyn_cast<StoreInst>(Store);
+ if (!SI)
+ return false;
+ assert(!LaneMask && "Unexpected mask on store");
auto *VecTy = cast<FixedVectorType>(SVI->getType());
assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
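A minimal sketch of what the isIncompatibleReg guard above changes for explicit register constraints (the surrounding objects are assumed to come from an initialized ARM subtarget; nothing here besides the getRegForInlineAsmConstraint call is from the patch):

    // Sketch only: an explicit "{s0}" constraint combined with an f64 value
    // now yields no register instead of an SPR that cannot hold the type.
    #include "ARMISelLowering.h"
    #include <cassert>

    static void checkSPRConstraint(const llvm::ARMTargetLowering &TLI,
                                   const llvm::TargetRegisterInfo &TRI) {
      auto RCP = TLI.getRegForInlineAsmConstraint(&TRI, "{s0}", llvm::MVT::f64);
      assert(RCP.second == nullptr && "f64 cannot live in a single SPR");
      (void)RCP;
    }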
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 9159f3d..825145d 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -685,7 +685,8 @@ class VectorType;
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
- bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+ bool lowerInterleavedStore(Instruction *Store, Value *Mask,
+ ShuffleVectorInst *SVI,
unsigned Factor) const override;
bool shouldInsertFencesForAtomic(const Instruction *I) const override;
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
index 5963976..6ec78d0 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
@@ -7,12 +7,10 @@
//===----------------------------------------------------------------------===//
#include "AVRMCExpr.h"
-#include "MCTargetDesc/AVRMCAsmInfo.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCValue.h"
namespace llvm {
diff --git a/llvm/lib/Target/BPF/BPF.h b/llvm/lib/Target/BPF/BPF.h
index 5d49949..7faae8b 100644
--- a/llvm/lib/Target/BPF/BPF.h
+++ b/llvm/lib/Target/BPF/BPF.h
@@ -22,7 +22,7 @@ class BPFTargetMachine;
class InstructionSelector;
class PassRegistry;
-static const char *BPF_TRAP = "__bpf_trap";
+#define BPF_TRAP "__bpf_trap"
ModulePass *createBPFCheckAndAdjustIR();
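For context on the one-line BPF.h change above: a namespace-scope `static const char *` in a header gives every translation unit that includes it a private copy, which can trigger unused-variable warnings; the macro sidesteps that. A C++17 alternative (hypothetical, shown only for illustration and not part of the patch) would be an inline constant:

    // Hypothetical alternative to the macro.
    inline constexpr char BPF_TRAP_NAME[] = "__bpf_trap";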
diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp
index a0011e8..fa9007e 100644
--- a/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp
+++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp
@@ -16,7 +16,6 @@
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
diff --git a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
index d9d9b36..feecfc0 100644
--- a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
+++ b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
@@ -301,41 +301,53 @@ bool DataScalarizerVisitor::visitExtractElementInst(ExtractElementInst &EEI) {
}
bool DataScalarizerVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
- Value *PtrOperand = GEPI.getPointerOperand();
- Type *OrigGEPType = GEPI.getSourceElementType();
- Type *NewGEPType = OrigGEPType;
+ GEPOperator *GOp = cast<GEPOperator>(&GEPI);
+ Value *PtrOperand = GOp->getPointerOperand();
+ Type *NewGEPType = GOp->getSourceElementType();
bool NeedsTransform = false;
+ // Unwrap GEP ConstantExprs to find the base operand and element type
+ while (auto *CE = dyn_cast<ConstantExpr>(PtrOperand)) {
+ if (auto *GEPCE = dyn_cast<GEPOperator>(CE)) {
+ GOp = GEPCE;
+ PtrOperand = GEPCE->getPointerOperand();
+ NewGEPType = GEPCE->getSourceElementType();
+ } else
+ break;
+ }
+
if (GlobalVariable *NewGlobal = lookupReplacementGlobal(PtrOperand)) {
NewGEPType = NewGlobal->getValueType();
PtrOperand = NewGlobal;
NeedsTransform = true;
} else if (AllocaInst *Alloca = dyn_cast<AllocaInst>(PtrOperand)) {
Type *AllocatedType = Alloca->getAllocatedType();
- // Only transform if the allocated type is an array
- if (AllocatedType != OrigGEPType && isa<ArrayType>(AllocatedType)) {
+ if (isa<ArrayType>(AllocatedType) &&
+ AllocatedType != GOp->getResultElementType()) {
NewGEPType = AllocatedType;
NeedsTransform = true;
}
}
- // Scalar geps should remain scalars geps. The dxil-flatten-arrays pass will
- // convert these scalar geps into flattened array geps
- if (!isa<ArrayType>(OrigGEPType))
- NewGEPType = OrigGEPType;
-
- // Note: We bail if this isn't a gep touched via alloca or global
- // transformations
if (!NeedsTransform)
return false;
- IRBuilder<> Builder(&GEPI);
- SmallVector<Value *, MaxVecSize> Indices(GEPI.indices());
+ // Keep scalar GEPs scalar; dxil-flatten-arrays will do flattening later
+ if (!isa<ArrayType>(GOp->getSourceElementType()))
+ NewGEPType = GOp->getSourceElementType();
+ IRBuilder<> Builder(&GEPI);
+ SmallVector<Value *, MaxVecSize> Indices(GOp->indices());
Value *NewGEP = Builder.CreateGEP(NewGEPType, PtrOperand, Indices,
- GEPI.getName(), GEPI.getNoWrapFlags());
- GEPI.replaceAllUsesWith(NewGEP);
- GEPI.eraseFromParent();
+ GOp->getName(), GOp->getNoWrapFlags());
+
+ GOp->replaceAllUsesWith(NewGEP);
+
+ if (auto *CE = dyn_cast<ConstantExpr>(GOp))
+ CE->destroyConstant();
+ else if (auto *OldGEPI = dyn_cast<GetElementPtrInst>(GOp))
+ OldGEPI->eraseFromParent();
+
return true;
}
diff --git a/llvm/lib/Target/DirectX/DXILPrepare.cpp b/llvm/lib/Target/DirectX/DXILPrepare.cpp
index 703a9e5..c8866bf 100644
--- a/llvm/lib/Target/DirectX/DXILPrepare.cpp
+++ b/llvm/lib/Target/DirectX/DXILPrepare.cpp
@@ -24,7 +24,6 @@
#include "llvm/IR/AttributeMask.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
-#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
@@ -240,11 +239,6 @@ public:
for (size_t Idx = 0, End = F.arg_size(); Idx < End; ++Idx)
F.removeParamAttrs(Idx, AttrMask);
- // Lifetime intrinsics in LLVM 3.7 do not have the memory FnAttr
- if (Intrinsic::ID IID = F.getIntrinsicID();
- IID == Intrinsic::lifetime_start || IID == Intrinsic::lifetime_end)
- F.removeFnAttr(Attribute::Memory);
-
for (auto &BB : F) {
IRBuilder<> Builder(&BB);
for (auto &I : make_early_inc_range(BB)) {
@@ -253,7 +247,7 @@ public:
// Emitting NoOp bitcast instructions allows the ValueEnumerator to be
// unmodified as it reserves instruction IDs during construction.
- if (auto *LI = dyn_cast<LoadInst>(&I)) {
+ if (auto LI = dyn_cast<LoadInst>(&I)) {
if (Value *NoOpBitcast = maybeGenerateBitcast(
Builder, PointerTypes, I, LI->getPointerOperand(),
LI->getType())) {
@@ -263,7 +257,7 @@ public:
}
continue;
}
- if (auto *SI = dyn_cast<StoreInst>(&I)) {
+ if (auto SI = dyn_cast<StoreInst>(&I)) {
if (Value *NoOpBitcast = maybeGenerateBitcast(
Builder, PointerTypes, I, SI->getPointerOperand(),
SI->getValueOperand()->getType())) {
@@ -274,7 +268,7 @@ public:
}
continue;
}
- if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+ if (auto GEP = dyn_cast<GetElementPtrInst>(&I)) {
if (Value *NoOpBitcast = maybeGenerateBitcast(
Builder, PointerTypes, I, GEP->getPointerOperand(),
GEP->getSourceElementType()))
@@ -286,17 +280,6 @@ public:
CB->removeRetAttrs(AttrMask);
for (size_t Idx = 0, End = CB->arg_size(); Idx < End; ++Idx)
CB->removeParamAttrs(Idx, AttrMask);
- // LLVM 3.7 Lifetime intrinics require an i8* pointer operand, so we
- // insert a bitcast here to ensure that is the case
- if (isa<LifetimeIntrinsic>(CB)) {
- Value *PtrOperand = CB->getArgOperand(1);
- Builder.SetInsertPoint(CB);
- PointerType *PtrTy = cast<PointerType>(PtrOperand->getType());
- Value *NoOpBitcast = Builder.Insert(
- CastInst::Create(Instruction::BitCast, PtrOperand,
- Builder.getPtrTy(PtrTy->getAddressSpace())));
- CB->setArgOperand(1, NoOpBitcast);
- }
continue;
}
}
diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp
index dfc8162..ebdfcaa 100644
--- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp
+++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp
@@ -16,6 +16,7 @@
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/DXILMetadataAnalysis.h"
#include "llvm/BinaryFormat/DXContainer.h"
+#include "llvm/Frontend/HLSL/RootSignatureMetadata.h"
#include "llvm/Frontend/HLSL/RootSignatureValidations.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DiagnosticInfo.h"
@@ -29,25 +30,10 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>
-#include <optional>
-#include <utility>
using namespace llvm;
using namespace llvm::dxil;
-static bool reportError(LLVMContext *Ctx, Twine Message,
- DiagnosticSeverity Severity = DS_Error) {
- Ctx->diagnose(DiagnosticInfoGeneric(Message, Severity));
- return true;
-}
-
-static bool reportValueError(LLVMContext *Ctx, Twine ParamName,
- uint32_t Value) {
- Ctx->diagnose(DiagnosticInfoGeneric(
- "Invalid value for " + ParamName + ": " + Twine(Value), DS_Error));
- return true;
-}
-
static std::optional<uint32_t> extractMdIntValue(MDNode *Node,
unsigned int OpId) {
if (auto *CI =
@@ -56,453 +42,10 @@ static std::optional<uint32_t> extractMdIntValue(MDNode *Node,
return std::nullopt;
}
-static std::optional<float> extractMdFloatValue(MDNode *Node,
- unsigned int OpId) {
- if (auto *CI = mdconst::dyn_extract<ConstantFP>(Node->getOperand(OpId).get()))
- return CI->getValueAPF().convertToFloat();
- return std::nullopt;
-}
-
-static std::optional<StringRef> extractMdStringValue(MDNode *Node,
- unsigned int OpId) {
- MDString *NodeText = dyn_cast<MDString>(Node->getOperand(OpId));
- if (NodeText == nullptr)
- return std::nullopt;
- return NodeText->getString();
-}
-
-static bool parseRootFlags(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD,
- MDNode *RootFlagNode) {
-
- if (RootFlagNode->getNumOperands() != 2)
- return reportError(Ctx, "Invalid format for RootFlag Element");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RootFlagNode, 1))
- RSD.Flags = *Val;
- else
- return reportError(Ctx, "Invalid value for RootFlag");
-
- return false;
-}
-
-static bool parseRootConstants(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD,
- MDNode *RootConstantNode) {
-
- if (RootConstantNode->getNumOperands() != 5)
- return reportError(Ctx, "Invalid format for RootConstants Element");
-
- dxbc::RTS0::v1::RootParameterHeader Header;
- // The parameter offset doesn't matter here - we recalculate it during
- // serialization Header.ParameterOffset = 0;
- Header.ParameterType =
- llvm::to_underlying(dxbc::RootParameterType::Constants32Bit);
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 1))
- Header.ShaderVisibility = *Val;
- else
- return reportError(Ctx, "Invalid value for ShaderVisibility");
-
- dxbc::RTS0::v1::RootConstants Constants;
- if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 2))
- Constants.ShaderRegister = *Val;
- else
- return reportError(Ctx, "Invalid value for ShaderRegister");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 3))
- Constants.RegisterSpace = *Val;
- else
- return reportError(Ctx, "Invalid value for RegisterSpace");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 4))
- Constants.Num32BitValues = *Val;
- else
- return reportError(Ctx, "Invalid value for Num32BitValues");
-
- RSD.ParametersContainer.addParameter(Header, Constants);
-
- return false;
-}
-
-static bool parseRootDescriptors(LLVMContext *Ctx,
- mcdxbc::RootSignatureDesc &RSD,
- MDNode *RootDescriptorNode,
- RootSignatureElementKind ElementKind) {
- assert(ElementKind == RootSignatureElementKind::SRV ||
- ElementKind == RootSignatureElementKind::UAV ||
- ElementKind == RootSignatureElementKind::CBV &&
- "parseRootDescriptors should only be called with RootDescriptor "
- "element kind.");
- if (RootDescriptorNode->getNumOperands() != 5)
- return reportError(Ctx, "Invalid format for Root Descriptor Element");
-
- dxbc::RTS0::v1::RootParameterHeader Header;
- switch (ElementKind) {
- case RootSignatureElementKind::SRV:
- Header.ParameterType = llvm::to_underlying(dxbc::RootParameterType::SRV);
- break;
- case RootSignatureElementKind::UAV:
- Header.ParameterType = llvm::to_underlying(dxbc::RootParameterType::UAV);
- break;
- case RootSignatureElementKind::CBV:
- Header.ParameterType = llvm::to_underlying(dxbc::RootParameterType::CBV);
- break;
- default:
- llvm_unreachable("invalid Root Descriptor kind");
- break;
- }
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 1))
- Header.ShaderVisibility = *Val;
- else
- return reportError(Ctx, "Invalid value for ShaderVisibility");
-
- dxbc::RTS0::v2::RootDescriptor Descriptor;
- if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 2))
- Descriptor.ShaderRegister = *Val;
- else
- return reportError(Ctx, "Invalid value for ShaderRegister");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 3))
- Descriptor.RegisterSpace = *Val;
- else
- return reportError(Ctx, "Invalid value for RegisterSpace");
-
- if (RSD.Version == 1) {
- RSD.ParametersContainer.addParameter(Header, Descriptor);
- return false;
- }
- assert(RSD.Version > 1);
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 4))
- Descriptor.Flags = *Val;
- else
- return reportError(Ctx, "Invalid value for Root Descriptor Flags");
-
- RSD.ParametersContainer.addParameter(Header, Descriptor);
- return false;
-}
-
-static bool parseDescriptorRange(LLVMContext *Ctx,
- mcdxbc::DescriptorTable &Table,
- MDNode *RangeDescriptorNode) {
-
- if (RangeDescriptorNode->getNumOperands() != 6)
- return reportError(Ctx, "Invalid format for Descriptor Range");
-
- dxbc::RTS0::v2::DescriptorRange Range;
-
- std::optional<StringRef> ElementText =
- extractMdStringValue(RangeDescriptorNode, 0);
-
- if (!ElementText.has_value())
- return reportError(Ctx, "Descriptor Range, first element is not a string.");
-
- Range.RangeType =
- StringSwitch<uint32_t>(*ElementText)
- .Case("CBV", llvm::to_underlying(dxbc::DescriptorRangeType::CBV))
- .Case("SRV", llvm::to_underlying(dxbc::DescriptorRangeType::SRV))
- .Case("UAV", llvm::to_underlying(dxbc::DescriptorRangeType::UAV))
- .Case("Sampler",
- llvm::to_underlying(dxbc::DescriptorRangeType::Sampler))
- .Default(~0U);
-
- if (Range.RangeType == ~0U)
- return reportError(Ctx, "Invalid Descriptor Range type: " + *ElementText);
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 1))
- Range.NumDescriptors = *Val;
- else
- return reportError(Ctx, "Invalid value for Number of Descriptor in Range");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 2))
- Range.BaseShaderRegister = *Val;
- else
- return reportError(Ctx, "Invalid value for BaseShaderRegister");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 3))
- Range.RegisterSpace = *Val;
- else
- return reportError(Ctx, "Invalid value for RegisterSpace");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 4))
- Range.OffsetInDescriptorsFromTableStart = *Val;
- else
- return reportError(Ctx,
- "Invalid value for OffsetInDescriptorsFromTableStart");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 5))
- Range.Flags = *Val;
- else
- return reportError(Ctx, "Invalid value for Descriptor Range Flags");
-
- Table.Ranges.push_back(Range);
- return false;
-}
-
-static bool parseDescriptorTable(LLVMContext *Ctx,
- mcdxbc::RootSignatureDesc &RSD,
- MDNode *DescriptorTableNode) {
- const unsigned int NumOperands = DescriptorTableNode->getNumOperands();
- if (NumOperands < 2)
- return reportError(Ctx, "Invalid format for Descriptor Table");
-
- dxbc::RTS0::v1::RootParameterHeader Header;
- if (std::optional<uint32_t> Val = extractMdIntValue(DescriptorTableNode, 1))
- Header.ShaderVisibility = *Val;
- else
- return reportError(Ctx, "Invalid value for ShaderVisibility");
-
- mcdxbc::DescriptorTable Table;
- Header.ParameterType =
- llvm::to_underlying(dxbc::RootParameterType::DescriptorTable);
-
- for (unsigned int I = 2; I < NumOperands; I++) {
- MDNode *Element = dyn_cast<MDNode>(DescriptorTableNode->getOperand(I));
- if (Element == nullptr)
- return reportError(Ctx, "Missing Root Element Metadata Node.");
-
- if (parseDescriptorRange(Ctx, Table, Element))
- return true;
- }
-
- RSD.ParametersContainer.addParameter(Header, Table);
- return false;
-}
-
-static bool parseStaticSampler(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD,
- MDNode *StaticSamplerNode) {
- if (StaticSamplerNode->getNumOperands() != 14)
- return reportError(Ctx, "Invalid format for Static Sampler");
-
- dxbc::RTS0::v1::StaticSampler Sampler;
- if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 1))
- Sampler.Filter = *Val;
- else
- return reportError(Ctx, "Invalid value for Filter");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 2))
- Sampler.AddressU = *Val;
- else
- return reportError(Ctx, "Invalid value for AddressU");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 3))
- Sampler.AddressV = *Val;
- else
- return reportError(Ctx, "Invalid value for AddressV");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 4))
- Sampler.AddressW = *Val;
- else
- return reportError(Ctx, "Invalid value for AddressW");
-
- if (std::optional<float> Val = extractMdFloatValue(StaticSamplerNode, 5))
- Sampler.MipLODBias = *Val;
- else
- return reportError(Ctx, "Invalid value for MipLODBias");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 6))
- Sampler.MaxAnisotropy = *Val;
- else
- return reportError(Ctx, "Invalid value for MaxAnisotropy");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 7))
- Sampler.ComparisonFunc = *Val;
- else
- return reportError(Ctx, "Invalid value for ComparisonFunc ");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 8))
- Sampler.BorderColor = *Val;
- else
- return reportError(Ctx, "Invalid value for ComparisonFunc ");
-
- if (std::optional<float> Val = extractMdFloatValue(StaticSamplerNode, 9))
- Sampler.MinLOD = *Val;
- else
- return reportError(Ctx, "Invalid value for MinLOD");
-
- if (std::optional<float> Val = extractMdFloatValue(StaticSamplerNode, 10))
- Sampler.MaxLOD = *Val;
- else
- return reportError(Ctx, "Invalid value for MaxLOD");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 11))
- Sampler.ShaderRegister = *Val;
- else
- return reportError(Ctx, "Invalid value for ShaderRegister");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 12))
- Sampler.RegisterSpace = *Val;
- else
- return reportError(Ctx, "Invalid value for RegisterSpace");
-
- if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 13))
- Sampler.ShaderVisibility = *Val;
- else
- return reportError(Ctx, "Invalid value for ShaderVisibility");
-
- RSD.StaticSamplers.push_back(Sampler);
- return false;
-}
-
-static bool parseRootSignatureElement(LLVMContext *Ctx,
- mcdxbc::RootSignatureDesc &RSD,
- MDNode *Element) {
- std::optional<StringRef> ElementText = extractMdStringValue(Element, 0);
- if (!ElementText.has_value())
- return reportError(Ctx, "Invalid format for Root Element");
-
- RootSignatureElementKind ElementKind =
- StringSwitch<RootSignatureElementKind>(*ElementText)
- .Case("RootFlags", RootSignatureElementKind::RootFlags)
- .Case("RootConstants", RootSignatureElementKind::RootConstants)
- .Case("RootCBV", RootSignatureElementKind::CBV)
- .Case("RootSRV", RootSignatureElementKind::SRV)
- .Case("RootUAV", RootSignatureElementKind::UAV)
- .Case("DescriptorTable", RootSignatureElementKind::DescriptorTable)
- .Case("StaticSampler", RootSignatureElementKind::StaticSamplers)
- .Default(RootSignatureElementKind::Error);
-
- switch (ElementKind) {
-
- case RootSignatureElementKind::RootFlags:
- return parseRootFlags(Ctx, RSD, Element);
- case RootSignatureElementKind::RootConstants:
- return parseRootConstants(Ctx, RSD, Element);
- case RootSignatureElementKind::CBV:
- case RootSignatureElementKind::SRV:
- case RootSignatureElementKind::UAV:
- return parseRootDescriptors(Ctx, RSD, Element, ElementKind);
- case RootSignatureElementKind::DescriptorTable:
- return parseDescriptorTable(Ctx, RSD, Element);
- case RootSignatureElementKind::StaticSamplers:
- return parseStaticSampler(Ctx, RSD, Element);
- case RootSignatureElementKind::Error:
- return reportError(Ctx, "Invalid Root Signature Element: " + *ElementText);
- }
-
- llvm_unreachable("Unhandled RootSignatureElementKind enum.");
-}
-
-static bool parse(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD,
- MDNode *Node) {
- bool HasError = false;
-
- // Loop through the Root Elements of the root signature.
- for (const auto &Operand : Node->operands()) {
- MDNode *Element = dyn_cast<MDNode>(Operand);
- if (Element == nullptr)
- return reportError(Ctx, "Missing Root Element Metadata Node.");
-
- HasError = HasError || parseRootSignatureElement(Ctx, RSD, Element);
- }
-
- return HasError;
-}
-
-static bool validate(LLVMContext *Ctx, const mcdxbc::RootSignatureDesc &RSD) {
-
- if (!llvm::hlsl::rootsig::verifyVersion(RSD.Version)) {
- return reportValueError(Ctx, "Version", RSD.Version);
- }
-
- if (!llvm::hlsl::rootsig::verifyRootFlag(RSD.Flags)) {
- return reportValueError(Ctx, "RootFlags", RSD.Flags);
- }
-
- for (const mcdxbc::RootParameterInfo &Info : RSD.ParametersContainer) {
- if (!dxbc::isValidShaderVisibility(Info.Header.ShaderVisibility))
- return reportValueError(Ctx, "ShaderVisibility",
- Info.Header.ShaderVisibility);
-
- assert(dxbc::isValidParameterType(Info.Header.ParameterType) &&
- "Invalid value for ParameterType");
-
- switch (Info.Header.ParameterType) {
-
- case llvm::to_underlying(dxbc::RootParameterType::CBV):
- case llvm::to_underlying(dxbc::RootParameterType::UAV):
- case llvm::to_underlying(dxbc::RootParameterType::SRV): {
- const dxbc::RTS0::v2::RootDescriptor &Descriptor =
- RSD.ParametersContainer.getRootDescriptor(Info.Location);
- if (!llvm::hlsl::rootsig::verifyRegisterValue(Descriptor.ShaderRegister))
- return reportValueError(Ctx, "ShaderRegister",
- Descriptor.ShaderRegister);
-
- if (!llvm::hlsl::rootsig::verifyRegisterSpace(Descriptor.RegisterSpace))
- return reportValueError(Ctx, "RegisterSpace", Descriptor.RegisterSpace);
-
- if (RSD.Version > 1) {
- if (!llvm::hlsl::rootsig::verifyRootDescriptorFlag(RSD.Version,
- Descriptor.Flags))
- return reportValueError(Ctx, "RootDescriptorFlag", Descriptor.Flags);
- }
- break;
- }
- case llvm::to_underlying(dxbc::RootParameterType::DescriptorTable): {
- const mcdxbc::DescriptorTable &Table =
- RSD.ParametersContainer.getDescriptorTable(Info.Location);
- for (const dxbc::RTS0::v2::DescriptorRange &Range : Table) {
- if (!llvm::hlsl::rootsig::verifyRangeType(Range.RangeType))
- return reportValueError(Ctx, "RangeType", Range.RangeType);
-
- if (!llvm::hlsl::rootsig::verifyRegisterSpace(Range.RegisterSpace))
- return reportValueError(Ctx, "RegisterSpace", Range.RegisterSpace);
-
- if (!llvm::hlsl::rootsig::verifyNumDescriptors(Range.NumDescriptors))
- return reportValueError(Ctx, "NumDescriptors", Range.NumDescriptors);
-
- if (!llvm::hlsl::rootsig::verifyDescriptorRangeFlag(
- RSD.Version, Range.RangeType, Range.Flags))
- return reportValueError(Ctx, "DescriptorFlag", Range.Flags);
- }
- break;
- }
- }
- }
-
- for (const dxbc::RTS0::v1::StaticSampler &Sampler : RSD.StaticSamplers) {
- if (!llvm::hlsl::rootsig::verifySamplerFilter(Sampler.Filter))
- return reportValueError(Ctx, "Filter", Sampler.Filter);
-
- if (!llvm::hlsl::rootsig::verifyAddress(Sampler.AddressU))
- return reportValueError(Ctx, "AddressU", Sampler.AddressU);
-
- if (!llvm::hlsl::rootsig::verifyAddress(Sampler.AddressV))
- return reportValueError(Ctx, "AddressV", Sampler.AddressV);
-
- if (!llvm::hlsl::rootsig::verifyAddress(Sampler.AddressW))
- return reportValueError(Ctx, "AddressW", Sampler.AddressW);
-
- if (!llvm::hlsl::rootsig::verifyMipLODBias(Sampler.MipLODBias))
- return reportValueError(Ctx, "MipLODBias", Sampler.MipLODBias);
-
- if (!llvm::hlsl::rootsig::verifyMaxAnisotropy(Sampler.MaxAnisotropy))
- return reportValueError(Ctx, "MaxAnisotropy", Sampler.MaxAnisotropy);
-
- if (!llvm::hlsl::rootsig::verifyComparisonFunc(Sampler.ComparisonFunc))
- return reportValueError(Ctx, "ComparisonFunc", Sampler.ComparisonFunc);
-
- if (!llvm::hlsl::rootsig::verifyBorderColor(Sampler.BorderColor))
- return reportValueError(Ctx, "BorderColor", Sampler.BorderColor);
-
- if (!llvm::hlsl::rootsig::verifyLOD(Sampler.MinLOD))
- return reportValueError(Ctx, "MinLOD", Sampler.MinLOD);
-
- if (!llvm::hlsl::rootsig::verifyLOD(Sampler.MaxLOD))
- return reportValueError(Ctx, "MaxLOD", Sampler.MaxLOD);
-
- if (!llvm::hlsl::rootsig::verifyRegisterValue(Sampler.ShaderRegister))
- return reportValueError(Ctx, "ShaderRegister", Sampler.ShaderRegister);
-
- if (!llvm::hlsl::rootsig::verifyRegisterSpace(Sampler.RegisterSpace))
- return reportValueError(Ctx, "RegisterSpace", Sampler.RegisterSpace);
-
- if (!dxbc::isValidShaderVisibility(Sampler.ShaderVisibility))
- return reportValueError(Ctx, "ShaderVisibility",
- Sampler.ShaderVisibility);
- }
-
- return false;
+static bool reportError(LLVMContext *Ctx, Twine Message,
+ DiagnosticSeverity Severity = DS_Error) {
+ Ctx->diagnose(DiagnosticInfoGeneric(Message, Severity));
+ return true;
}
static SmallDenseMap<const Function *, mcdxbc::RootSignatureDesc>
@@ -584,7 +127,9 @@ analyzeModule(Module &M) {
// static sampler offset is calculated when writing the dxcontainer.
RSD.StaticSamplersOffset = 0u;
- if (parse(Ctx, RSD, RootElementListNode) || validate(Ctx, RSD)) {
+ hlsl::rootsig::MetadataParser MDParser(RootElementListNode);
+
+ if (MDParser.ParseRootSignature(Ctx, RSD)) {
return RSDMap;
}
diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.h b/llvm/lib/Target/DirectX/DXILRootSignature.h
index fc39b38..254b7ff 100644
--- a/llvm/lib/Target/DirectX/DXILRootSignature.h
+++ b/llvm/lib/Target/DirectX/DXILRootSignature.h
@@ -26,17 +26,6 @@
namespace llvm {
namespace dxil {
-enum class RootSignatureElementKind {
- Error = 0,
- RootFlags = 1,
- RootConstants = 2,
- SRV = 3,
- UAV = 4,
- CBV = 5,
- DescriptorTable = 6,
- StaticSamplers = 7
-};
-
class RootSignatureBindingInfo {
private:
SmallDenseMap<const Function *, mcdxbc::RootSignatureDesc> FuncToRsMap;
diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
index 46d5d71..1d79c30 100644
--- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
+++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
@@ -2545,25 +2545,6 @@ void DXILBitcodeWriter::writeInstruction(const Instruction &I, unsigned InstID,
Vals.clear();
}
-// HLSL Change
-namespace {
-struct ValueNameCreator {
- MallocAllocator Allocator;
- SmallVector<ValueName *, 2>
- ValueNames; // SmallVector N = 2 because we currently only expect this
- // to hold ValueNames for Lifetime intrinsics
- ~ValueNameCreator() {
- for (auto *VN : ValueNames)
- VN->Destroy(Allocator);
- }
- ValueName *create(StringRef Name, Value *V) {
- ValueName *VN = ValueName::create(Name, Allocator, V);
- ValueNames.push_back(VN);
- return VN;
- }
-};
-} // anonymous namespace
-
// Emit names for globals/functions etc.
void DXILBitcodeWriter::writeFunctionLevelValueSymbolTable(
const ValueSymbolTable &VST) {
@@ -2578,24 +2559,9 @@ void DXILBitcodeWriter::writeFunctionLevelValueSymbolTable(
// to ensure the binary is the same no matter what values ever existed.
SmallVector<const ValueName *, 16> SortedTable;
- // HLSL Change
- ValueNameCreator VNC;
for (auto &VI : VST) {
- ValueName *VN = VI.second->getValueName();
- // Clang mangles lifetime intrinsic names by appending '.p0' to the end,
- // making them invalid lifetime intrinsics in LLVM 3.7. We can't
- // demangle in dxil-prepare because it would result in invalid IR.
- // Therefore we have to do this in the bitcode writer while writing its
- // name to the symbol table.
- if (const Function *Fn = dyn_cast<Function>(VI.getValue());
- Fn && Fn->isIntrinsic()) {
- Intrinsic::ID IID = Fn->getIntrinsicID();
- if (IID == Intrinsic::lifetime_start || IID == Intrinsic::lifetime_end)
- VN = VNC.create(Intrinsic::getBaseName(IID), VI.second);
- }
- SortedTable.push_back(VN);
+ SortedTable.push_back(VI.second->getValueName());
}
-
// The keys are unique, so there shouldn't be stability issues.
llvm::sort(SortedTable, [](const ValueName *A, const ValueName *B) {
return A->first() < B->first();
diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp
index dfc79039c..1bd5dd7 100644
--- a/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp
+++ b/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp
@@ -17,6 +17,7 @@
#include "llvm/Analysis/ModuleSummaryAnalysis.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/InitializePasses.h"
@@ -52,6 +53,53 @@ public:
}
};
+static void legalizeLifetimeIntrinsics(Module &M) {
+ for (Function &F : M) {
+ Intrinsic::ID IID = F.getIntrinsicID();
+ if (IID != Intrinsic::lifetime_start && IID != Intrinsic::lifetime_end)
+ continue;
+
+ // Lifetime intrinsics in LLVM 3.7 do not have the memory FnAttr
+ F.removeFnAttr(Attribute::Memory);
+
+ // Lifetime intrinsics in LLVM 3.7 do not have mangled names
+ F.setName(Intrinsic::getBaseName(IID));
+
+ // LLVM 3.7 Lifetime intrinsics require an i8* operand, so we insert bitcasts
+ // to ensure that is the case
+ for (auto *User : make_early_inc_range(F.users())) {
+ CallInst *CI = dyn_cast<CallInst>(User);
+ assert(CI && "Expected user of a lifetime intrinsic function to be a "
+ "lifetime intrinsic call");
+ Value *PtrOperand = CI->getArgOperand(1);
+ PointerType *PtrTy = cast<PointerType>(PtrOperand->getType());
+ Value *NoOpBitCast = CastInst::Create(Instruction::BitCast, PtrOperand,
+ PtrTy, "", CI->getIterator());
+ CI->setArgOperand(1, NoOpBitCast);
+ }
+ }
+}
+
+static void removeLifetimeIntrinsics(Module &M) {
+ for (Function &F : make_early_inc_range(M)) {
+ if (Intrinsic::ID IID = F.getIntrinsicID();
+ IID != Intrinsic::lifetime_start && IID != Intrinsic::lifetime_end)
+ continue;
+
+ for (User *U : make_early_inc_range(F.users())) {
+ LifetimeIntrinsic *LI = dyn_cast<LifetimeIntrinsic>(U);
+ assert(LI && "Expected user of lifetime intrinsic function to be "
+ "a LifetimeIntrinsic instruction");
+ BitCastInst *BCI = dyn_cast<BitCastInst>(LI->getArgOperand(1));
+ assert(BCI && "Expected pointer operand of LifetimeIntrinsic to be a "
+ "BitCastInst");
+ LI->eraseFromParent();
+ BCI->eraseFromParent();
+ }
+ F.eraseFromParent();
+ }
+}
+
class EmbedDXILPass : public llvm::ModulePass {
public:
static char ID; // Pass identification, replacement for typeid
@@ -70,8 +118,17 @@ public:
// Only the output bitcode need to be DXIL triple.
M.setTargetTriple(Triple("dxil-ms-dx"));
+ // Perform late legalization of lifetime intrinsics that would otherwise
+ // fail the Module Verifier if performed in an earlier pass
+ legalizeLifetimeIntrinsics(M);
+
WriteDXILToFile(M, OS);
+ // We no longer need lifetime intrinsics after bitcode serialization, so we
+ // simply remove them to keep the Module Verifier happy after our
+ // not-so-legal legalizations
+ removeLifetimeIntrinsics(M);
+
// Recover triple.
M.setTargetTriple(OriginalTriple);
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrFormats.td b/llvm/lib/Target/Hexagon/HexagonInstrFormats.td
index f0ca908..6050649 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrFormats.td
+++ b/llvm/lib/Target/Hexagon/HexagonInstrFormats.td
@@ -336,5 +336,4 @@ class InstDuplex<bits<4> iClass, string cstr = ""> : Instruction,
// Instruction Classes Definitions -
//===----------------------------------------------------------------------===//
-include "HexagonInstrFormatsV60.td"
include "HexagonInstrFormatsV65.td"
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrFormatsV60.td b/llvm/lib/Target/Hexagon/HexagonInstrFormatsV60.td
deleted file mode 100644
index 86a8218..0000000
--- a/llvm/lib/Target/Hexagon/HexagonInstrFormatsV60.td
+++ /dev/null
@@ -1,21 +0,0 @@
-//==- HexagonInstrFormatsV60.td - Hexagon Instruction Formats -*- tablegen -==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the Hexagon V60 instruction classes in TableGen format.
-//
-//===----------------------------------------------------------------------===//
-
-//----------------------------------------------------------------------------//
-// Instruction Classes Definitions +
-//----------------------------------------------------------------------------//
-
-class CVI_VA_Resource<dag outs, dag ins, string asmstr,
- list<dag> pattern = [], string cstr = "",
- InstrItinClass itin = CVI_VA>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VA>,
- OpcodeHexagon, Requires<[HasV60, UseHVX]>;
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrFormatsV65.td b/llvm/lib/Target/Hexagon/HexagonInstrFormatsV65.td
index 246a1d3..85b826f 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrFormatsV65.td
+++ b/llvm/lib/Target/Hexagon/HexagonInstrFormatsV65.td
@@ -20,11 +20,6 @@
// Instruction Classes Definitions +
//----------------------------------------------------------------------------//
-class CVI_VA_Resource_NoOpcode<dag outs, dag ins, string asmstr,
- list<dag> pattern = [], string cstr = "",
- InstrItinClass itin = CVI_VA>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VA>;
-
class CVI_GATHER_TMP_LD_Resource_NoOpcode<dag outs, dag ins, string asmstr,
list<dag> pattern = [], string cstr = "",
InstrItinClass itin = CVI_GATHER_PSEUDO>
diff --git a/llvm/lib/Target/Hexagon/HexagonIntrinsicsV5.td b/llvm/lib/Target/Hexagon/HexagonIntrinsicsV5.td
deleted file mode 100644
index 44f39a3..0000000
--- a/llvm/lib/Target/Hexagon/HexagonIntrinsicsV5.td
+++ /dev/null
@@ -1,414 +0,0 @@
-//===- HexagonIntrinsicsV5.td - V5 Instruction intrinsics --*- tablegen -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-def : T_PR_pat <M2_vrcmpys_s1, int_hexagon_M2_vrcmpys_s1>;
-def : T_PPR_pat<M2_vrcmpys_acc_s1, int_hexagon_M2_vrcmpys_acc_s1>;
-def : T_PR_pat <M2_vrcmpys_s1rp, int_hexagon_M2_vrcmpys_s1rp>;
-
-// Vector reduce add unsigned halfwords
-def : T_PP_pat<M2_vradduh, int_hexagon_M2_vradduh>;
-
-def: T_RP_pat<A2_addsp, int_hexagon_A2_addsp>;
-def: T_PP_pat<A2_addpsat, int_hexagon_A2_addpsat>;
-def: T_PP_pat<A2_minp, int_hexagon_A2_minp>;
-def: T_PP_pat<A2_minup, int_hexagon_A2_minup>;
-def: T_PP_pat<A2_maxp, int_hexagon_A2_maxp>;
-def: T_PP_pat<A2_maxup, int_hexagon_A2_maxup>;
-
-// Vector reduce multiply word by signed half (32x16)
-//Rdd=vrmpyweh(Rss,Rtt)[:<<1]
-def : T_PP_pat <M4_vrmpyeh_s0, int_hexagon_M4_vrmpyeh_s0>;
-def : T_PP_pat <M4_vrmpyeh_s1, int_hexagon_M4_vrmpyeh_s1>;
-
-//Rdd=vrmpywoh(Rss,Rtt)[:<<1]
-def : T_PP_pat <M4_vrmpyoh_s0, int_hexagon_M4_vrmpyoh_s0>;
-def : T_PP_pat <M4_vrmpyoh_s1, int_hexagon_M4_vrmpyoh_s1>;
-
-//Rdd+=vrmpyweh(Rss,Rtt)[:<<1]
-def : T_PPP_pat <M4_vrmpyeh_acc_s0, int_hexagon_M4_vrmpyeh_acc_s0>;
-def : T_PPP_pat <M4_vrmpyeh_acc_s1, int_hexagon_M4_vrmpyeh_acc_s1>;
-
-//Rdd=vrmpywoh(Rss,Rtt)[:<<1]
-def : T_PPP_pat <M4_vrmpyoh_acc_s0, int_hexagon_M4_vrmpyoh_acc_s0>;
-def : T_PPP_pat <M4_vrmpyoh_acc_s1, int_hexagon_M4_vrmpyoh_acc_s1>;
-
-// Vector multiply halfwords, signed by unsigned
-// Rdd=vmpyhsu(Rs,Rt)[:<<1]:sat
-def : T_RR_pat <M2_vmpy2su_s0, int_hexagon_M2_vmpy2su_s0>;
-def : T_RR_pat <M2_vmpy2su_s1, int_hexagon_M2_vmpy2su_s1>;
-
-// Rxx+=vmpyhsu(Rs,Rt)[:<<1]:sat
-def : T_PRR_pat <M2_vmac2su_s0, int_hexagon_M2_vmac2su_s0>;
-def : T_PRR_pat <M2_vmac2su_s1, int_hexagon_M2_vmac2su_s1>;
-
-// Vector polynomial multiply halfwords
-// Rdd=vpmpyh(Rs,Rt)
-def : T_RR_pat <M4_vpmpyh, int_hexagon_M4_vpmpyh>;
-// Rxx[^]=vpmpyh(Rs,Rt)
-def : T_PRR_pat <M4_vpmpyh_acc, int_hexagon_M4_vpmpyh_acc>;
-
-// Polynomial multiply words
-// Rdd=pmpyw(Rs,Rt)
-def : T_RR_pat <M4_pmpyw, int_hexagon_M4_pmpyw>;
-// Rxx^=pmpyw(Rs,Rt)
-def : T_PRR_pat <M4_pmpyw_acc, int_hexagon_M4_pmpyw_acc>;
-
-//Rxx^=asr(Rss,Rt)
-def : T_PPR_pat <S2_asr_r_p_xor, int_hexagon_S2_asr_r_p_xor>;
-//Rxx^=asl(Rss,Rt)
-def : T_PPR_pat <S2_asl_r_p_xor, int_hexagon_S2_asl_r_p_xor>;
-//Rxx^=lsr(Rss,Rt)
-def : T_PPR_pat <S2_lsr_r_p_xor, int_hexagon_S2_lsr_r_p_xor>;
-//Rxx^=lsl(Rss,Rt)
-def : T_PPR_pat <S2_lsl_r_p_xor, int_hexagon_S2_lsl_r_p_xor>;
-
-// Multiply and use upper result
-def : T_RR_pat <M2_mpysu_up, int_hexagon_M2_mpysu_up>;
-def : T_RR_pat <M2_mpy_up_s1, int_hexagon_M2_mpy_up_s1>;
-def : T_RR_pat <M2_hmmpyh_s1, int_hexagon_M2_hmmpyh_s1>;
-def : T_RR_pat <M2_hmmpyl_s1, int_hexagon_M2_hmmpyl_s1>;
-def : T_RR_pat <M2_mpy_up_s1_sat, int_hexagon_M2_mpy_up_s1_sat>;
-
-def : T_PP_pat <A2_vaddub, int_hexagon_A2_vaddb_map>;
-def : T_PP_pat <A2_vsubub, int_hexagon_A2_vsubb_map>;
-
-// Vector reduce add unsigned halfwords
-def : T_PP_pat <M2_vraddh, int_hexagon_M2_vraddh>;
-
-def: T_P_pat<S2_brevp, int_hexagon_S2_brevp>;
-def: T_P_pat<S2_ct0p, int_hexagon_S2_ct0p>;
-def: T_P_pat<S2_ct1p, int_hexagon_S2_ct1p>;
-
-def: T_Q_RR_pat<C4_nbitsset, int_hexagon_C4_nbitsset>;
-def: T_Q_RR_pat<C4_nbitsclr, int_hexagon_C4_nbitsclr>;
-def: T_Q_RI_pat<C4_nbitsclri, int_hexagon_C4_nbitsclri>;
-
-def : T_Q_PI_pat<A4_vcmpbeqi, int_hexagon_A4_vcmpbeqi>;
-def : T_Q_PI_pat<A4_vcmpbgti, int_hexagon_A4_vcmpbgti>;
-def : T_Q_PI_pat<A4_vcmpbgtui, int_hexagon_A4_vcmpbgtui>;
-def : T_Q_PI_pat<A4_vcmpheqi, int_hexagon_A4_vcmpheqi>;
-def : T_Q_PI_pat<A4_vcmphgti, int_hexagon_A4_vcmphgti>;
-def : T_Q_PI_pat<A4_vcmphgtui, int_hexagon_A4_vcmphgtui>;
-def : T_Q_PI_pat<A4_vcmpweqi, int_hexagon_A4_vcmpweqi>;
-def : T_Q_PI_pat<A4_vcmpwgti, int_hexagon_A4_vcmpwgti>;
-def : T_Q_PI_pat<A4_vcmpwgtui, int_hexagon_A4_vcmpwgtui>;
-def : T_Q_PP_pat<A4_vcmpbeq_any, int_hexagon_A4_vcmpbeq_any>;
-
-def : T_Q_RR_pat<A4_cmpbeq, int_hexagon_A4_cmpbeq>;
-def : T_Q_RR_pat<A4_cmpbgt, int_hexagon_A4_cmpbgt>;
-def : T_Q_RR_pat<A4_cmpbgtu, int_hexagon_A4_cmpbgtu>;
-def : T_Q_RR_pat<A4_cmpheq, int_hexagon_A4_cmpheq>;
-def : T_Q_RR_pat<A4_cmphgt, int_hexagon_A4_cmphgt>;
-def : T_Q_RR_pat<A4_cmphgtu, int_hexagon_A4_cmphgtu>;
-
-def : T_Q_RI_pat<A4_cmpbeqi, int_hexagon_A4_cmpbeqi>;
-def : T_Q_RI_pat<A4_cmpbgti, int_hexagon_A4_cmpbgti>;
-def : T_Q_RI_pat<A4_cmpbgtui, int_hexagon_A4_cmpbgtui>;
-
-def : T_Q_RI_pat<A4_cmpheqi, int_hexagon_A4_cmpheqi>;
-def : T_Q_RI_pat<A4_cmphgti, int_hexagon_A4_cmphgti>;
-def : T_Q_RI_pat<A4_cmphgtui, int_hexagon_A4_cmphgtui>;
-
-def : T_Q_RP_pat<A4_boundscheck, int_hexagon_A4_boundscheck>;
-def : T_Q_PR_pat<A4_tlbmatch, int_hexagon_A4_tlbmatch>;
-
-def : T_RRR_pat <M4_mpyrr_addr, int_hexagon_M4_mpyrr_addr>;
-def : T_IRR_pat <M4_mpyrr_addi, int_hexagon_M4_mpyrr_addi>;
-def : T_IRI_pat <M4_mpyri_addi, int_hexagon_M4_mpyri_addi>;
-def : T_RIR_pat <M4_mpyri_addr_u2, int_hexagon_M4_mpyri_addr_u2>;
-def : T_RRI_pat <M4_mpyri_addr, int_hexagon_M4_mpyri_addr>;
-def : T_RRR_pat <M4_mac_up_s1_sat, int_hexagon_M4_mac_up_s1_sat>;
-def : T_RRR_pat <M4_nac_up_s1_sat, int_hexagon_M4_nac_up_s1_sat>;
-
-// Complex multiply 32x16
-def : T_PR_pat <M4_cmpyi_wh, int_hexagon_M4_cmpyi_wh>;
-def : T_PR_pat <M4_cmpyr_wh, int_hexagon_M4_cmpyr_wh>;
-
-def : T_PR_pat <M4_cmpyi_whc, int_hexagon_M4_cmpyi_whc>;
-def : T_PR_pat <M4_cmpyr_whc, int_hexagon_M4_cmpyr_whc>;
-
-def : T_PP_pat<A4_andnp, int_hexagon_A4_andnp>;
-def : T_PP_pat<A4_ornp, int_hexagon_A4_ornp>;
-
-// Complex add/sub halfwords/words
-def : T_PP_pat <S4_vxaddsubw, int_hexagon_S4_vxaddsubw>;
-def : T_PP_pat <S4_vxsubaddw, int_hexagon_S4_vxsubaddw>;
-def : T_PP_pat <S4_vxaddsubh, int_hexagon_S4_vxaddsubh>;
-def : T_PP_pat <S4_vxsubaddh, int_hexagon_S4_vxsubaddh>;
-
-def : T_PP_pat <S4_vxaddsubhr, int_hexagon_S4_vxaddsubhr>;
-def : T_PP_pat <S4_vxsubaddhr, int_hexagon_S4_vxsubaddhr>;
-
-// Extract bitfield
-def : T_PP_pat <S4_extractp_rp, int_hexagon_S4_extractp_rp>;
-def : T_RP_pat <S4_extract_rp, int_hexagon_S4_extract_rp>;
-def : T_PII_pat <S4_extractp, int_hexagon_S4_extractp>;
-def : T_RII_pat <S4_extract, int_hexagon_S4_extract>;
-
-// Vector conditional negate
-// Rdd=vcnegh(Rss,Rt)
-def : T_PR_pat <S2_vcnegh, int_hexagon_S2_vcnegh>;
-
-// Shift an immediate left by register amount
-def : T_IR_pat<S4_lsli, int_hexagon_S4_lsli>;
-
-// Vector reduce maximum halfwords
-def : T_PPR_pat <A4_vrmaxh, int_hexagon_A4_vrmaxh>;
-def : T_PPR_pat <A4_vrmaxuh, int_hexagon_A4_vrmaxuh>;
-
-// Vector reduce maximum words
-def : T_PPR_pat <A4_vrmaxw, int_hexagon_A4_vrmaxw>;
-def : T_PPR_pat <A4_vrmaxuw, int_hexagon_A4_vrmaxuw>;
-
-// Vector reduce minimum halfwords
-def : T_PPR_pat <A4_vrminh, int_hexagon_A4_vrminh>;
-def : T_PPR_pat <A4_vrminuh, int_hexagon_A4_vrminuh>;
-
-// Vector reduce minimum words
-def : T_PPR_pat <A4_vrminw, int_hexagon_A4_vrminw>;
-def : T_PPR_pat <A4_vrminuw, int_hexagon_A4_vrminuw>;
-
-// Rotate and reduce bytes
-def : Pat <(int_hexagon_S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2,
- u2_0ImmPred:$src3),
- (S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2, u2_0ImmPred:$src3)>;
-
-// Rotate and reduce bytes with accumulation
-// Rxx+=vrcrotate(Rss,Rt,#u2)
-def : Pat <(int_hexagon_S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2,
- IntRegs:$src3, u2_0ImmPred:$src4),
- (S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2,
- IntRegs:$src3, u2_0ImmPred:$src4)>;
-
-// Vector conditional negate
-def : T_PPR_pat<S2_vrcnegh, int_hexagon_S2_vrcnegh>;
-
-// Logical xor with xor accumulation
-def : T_PPP_pat<M4_xor_xacc, int_hexagon_M4_xor_xacc>;
-
-// ALU64 - Vector min/max byte
-def : T_PP_pat <A2_vminb, int_hexagon_A2_vminb>;
-def : T_PP_pat <A2_vmaxb, int_hexagon_A2_vmaxb>;
-
-// Shift and add/sub/and/or
-def : T_IRI_pat <S4_andi_asl_ri, int_hexagon_S4_andi_asl_ri>;
-def : T_IRI_pat <S4_ori_asl_ri, int_hexagon_S4_ori_asl_ri>;
-def : T_IRI_pat <S4_addi_asl_ri, int_hexagon_S4_addi_asl_ri>;
-def : T_IRI_pat <S4_subi_asl_ri, int_hexagon_S4_subi_asl_ri>;
-def : T_IRI_pat <S4_andi_lsr_ri, int_hexagon_S4_andi_lsr_ri>;
-def : T_IRI_pat <S4_ori_lsr_ri, int_hexagon_S4_ori_lsr_ri>;
-def : T_IRI_pat <S4_addi_lsr_ri, int_hexagon_S4_addi_lsr_ri>;
-def : T_IRI_pat <S4_subi_lsr_ri, int_hexagon_S4_subi_lsr_ri>;
-
-// Split bitfield
-def : T_RI_pat <A4_bitspliti, int_hexagon_A4_bitspliti>;
-def : T_RR_pat <A4_bitsplit, int_hexagon_A4_bitsplit>;
-
-def: T_RR_pat<S4_parity, int_hexagon_S4_parity>;
-
-def: T_Q_RI_pat<S4_ntstbit_i, int_hexagon_S4_ntstbit_i>;
-def: T_Q_RR_pat<S4_ntstbit_r, int_hexagon_S4_ntstbit_r>;
-
-def: T_RI_pat<S4_clbaddi, int_hexagon_S4_clbaddi>;
-def: T_PI_pat<S4_clbpaddi, int_hexagon_S4_clbpaddi>;
-def: T_P_pat <S4_clbpnorm, int_hexagon_S4_clbpnorm>;
-
-//*******************************************************************
-// ALU32/ALU
-//*******************************************************************
-
-// ALU32 / ALU / Logical Operations.
-def: T_RR_pat<A4_andn, int_hexagon_A4_andn>;
-def: T_RR_pat<A4_orn, int_hexagon_A4_orn>;
-
-//*******************************************************************
-// ALU32/PERM
-//*******************************************************************
-
-// Combine Words Into Doublewords.
-def: T_RI_pat<A4_combineri, int_hexagon_A4_combineri, s32_0ImmPred>;
-def: T_IR_pat<A4_combineir, int_hexagon_A4_combineir, s32_0ImmPred>;
-
-//*******************************************************************
-// ALU32/PRED
-//*******************************************************************
-
-// Compare
-def : T_Q_RI_pat<C4_cmpneqi, int_hexagon_C4_cmpneqi, s32_0ImmPred>;
-def : T_Q_RI_pat<C4_cmpltei, int_hexagon_C4_cmpltei, s32_0ImmPred>;
-def : T_Q_RI_pat<C4_cmplteui, int_hexagon_C4_cmplteui, u32_0ImmPred>;
-
-// Compare To General Register.
-def: T_Q_RR_pat<C4_cmpneq, int_hexagon_C4_cmpneq>;
-def: T_Q_RR_pat<C4_cmplte, int_hexagon_C4_cmplte>;
-def: T_Q_RR_pat<C4_cmplteu, int_hexagon_C4_cmplteu>;
-
-def: T_RR_pat<A4_rcmpeq, int_hexagon_A4_rcmpeq>;
-def: T_RR_pat<A4_rcmpneq, int_hexagon_A4_rcmpneq>;
-
-def: T_RI_pat<A4_rcmpeqi, int_hexagon_A4_rcmpeqi>;
-def: T_RI_pat<A4_rcmpneqi, int_hexagon_A4_rcmpneqi>;
-
-//*******************************************************************
-// CR
-//*******************************************************************
-
-// CR / Logical Operations On Predicates.
-def: T_Q_QQQ_pat<C4_and_and, int_hexagon_C4_and_and>;
-def: T_Q_QQQ_pat<C4_and_andn, int_hexagon_C4_and_andn>;
-def: T_Q_QQQ_pat<C4_and_or, int_hexagon_C4_and_or>;
-def: T_Q_QQQ_pat<C4_and_orn, int_hexagon_C4_and_orn>;
-def: T_Q_QQQ_pat<C4_or_and, int_hexagon_C4_or_and>;
-def: T_Q_QQQ_pat<C4_or_andn, int_hexagon_C4_or_andn>;
-def: T_Q_QQQ_pat<C4_or_or, int_hexagon_C4_or_or>;
-def: T_Q_QQQ_pat<C4_or_orn, int_hexagon_C4_or_orn>;
-
-//*******************************************************************
-// XTYPE/ALU
-//*******************************************************************
-
-// Add And Accumulate.
-
-def : T_RRI_pat <S4_addaddi, int_hexagon_S4_addaddi>;
-def : T_RIR_pat <S4_subaddi, int_hexagon_S4_subaddi>;
-
-
-// XTYPE / ALU / Logical-logical Words.
-def : T_RRR_pat <M4_or_xor, int_hexagon_M4_or_xor>;
-def : T_RRR_pat <M4_and_xor, int_hexagon_M4_and_xor>;
-def : T_RRR_pat <M4_or_and, int_hexagon_M4_or_and>;
-def : T_RRR_pat <M4_and_and, int_hexagon_M4_and_and>;
-def : T_RRR_pat <M4_xor_and, int_hexagon_M4_xor_and>;
-def : T_RRR_pat <M4_or_or, int_hexagon_M4_or_or>;
-def : T_RRR_pat <M4_and_or, int_hexagon_M4_and_or>;
-def : T_RRR_pat <M4_xor_or, int_hexagon_M4_xor_or>;
-def : T_RRR_pat <M4_or_andn, int_hexagon_M4_or_andn>;
-def : T_RRR_pat <M4_and_andn, int_hexagon_M4_and_andn>;
-def : T_RRR_pat <M4_xor_andn, int_hexagon_M4_xor_andn>;
-
-def : T_RRI_pat <S4_or_andi, int_hexagon_S4_or_andi>;
-def : T_RRI_pat <S4_or_andix, int_hexagon_S4_or_andix>;
-def : T_RRI_pat <S4_or_ori, int_hexagon_S4_or_ori>;
-
-// Modulo wrap.
-def : T_RR_pat <A4_modwrapu, int_hexagon_A4_modwrapu>;
-
-// Arithmetic/Convergent round
-// Rd=[cround|round](Rs,Rt)[:sat]
-// Rd=[cround|round](Rs,#u5)[:sat]
-def : T_RI_pat <A4_cround_ri, int_hexagon_A4_cround_ri>;
-def : T_RR_pat <A4_cround_rr, int_hexagon_A4_cround_rr>;
-
-def : T_RI_pat <A4_round_ri, int_hexagon_A4_round_ri>;
-def : T_RR_pat <A4_round_rr, int_hexagon_A4_round_rr>;
-
-def : T_RI_pat <A4_round_ri_sat, int_hexagon_A4_round_ri_sat>;
-def : T_RR_pat <A4_round_rr_sat, int_hexagon_A4_round_rr_sat>;
-
-def : T_P_pat <A2_roundsat, int_hexagon_A2_roundsat>;
-
-//Rdd[+]=vrmpybsu(Rss,Rtt)
-//Rdd[+]=vrmpybuu(Rss,Rtt)
-def : T_PP_pat <M5_vrmpybsu, int_hexagon_M5_vrmpybsu>;
-def : T_PP_pat <M5_vrmpybuu, int_hexagon_M5_vrmpybuu>;
-
-def : T_PP_pat <M5_vdmpybsu, int_hexagon_M5_vdmpybsu>;
-
-def : T_PPP_pat <M5_vrmacbsu, int_hexagon_M5_vrmacbsu>;
-def : T_PPP_pat <M5_vrmacbuu, int_hexagon_M5_vrmacbuu>;
-//Rxx+=vdmpybsu(Rss,Rtt):sat
-def : T_PPP_pat <M5_vdmacbsu, int_hexagon_M5_vdmacbsu>;
-
-// Vector multiply bytes
-// Rdd=vmpyb[s]u(Rs,Rt)
-def : T_RR_pat <M5_vmpybsu, int_hexagon_M5_vmpybsu>;
-def : T_RR_pat <M5_vmpybuu, int_hexagon_M5_vmpybuu>;
-
-// Rxx+=vmpyb[s]u(Rs,Rt)
-def : T_PRR_pat <M5_vmacbsu, int_hexagon_M5_vmacbsu>;
-def : T_PRR_pat <M5_vmacbuu, int_hexagon_M5_vmacbuu>;
-
-// Rd=vaddhub(Rss,Rtt):sat
-def : T_PP_pat <A5_vaddhubs, int_hexagon_A5_vaddhubs>;
-
-def : T_FF_pat<F2_sfadd, int_hexagon_F2_sfadd>;
-def : T_FF_pat<F2_sfsub, int_hexagon_F2_sfsub>;
-def : T_FF_pat<F2_sfmpy, int_hexagon_F2_sfmpy>;
-def : T_FF_pat<F2_sfmax, int_hexagon_F2_sfmax>;
-def : T_FF_pat<F2_sfmin, int_hexagon_F2_sfmin>;
-
-def : T_FF_pat<F2_sffixupn, int_hexagon_F2_sffixupn>;
-def : T_FF_pat<F2_sffixupd, int_hexagon_F2_sffixupd>;
-def : T_F_pat <F2_sffixupr, int_hexagon_F2_sffixupr>;
-
-def : T_Q_QQ_pat<C4_fastcorner9, int_hexagon_C4_fastcorner9>;
-def : T_Q_QQ_pat<C4_fastcorner9_not, int_hexagon_C4_fastcorner9_not>;
-
-def : T_P_pat <S5_popcountp, int_hexagon_S5_popcountp>;
-def : T_PI_pat <S5_asrhub_sat, int_hexagon_S5_asrhub_sat>;
-
-def : T_PI_pat <S2_asr_i_p_rnd, int_hexagon_S2_asr_i_p_rnd>;
-def : T_PI_pat <S2_asr_i_p_rnd_goodsyntax,
- int_hexagon_S2_asr_i_p_rnd_goodsyntax>;
-
-def : T_PI_pat <S5_asrhub_rnd_sat_goodsyntax,
- int_hexagon_S5_asrhub_rnd_sat_goodsyntax>;
-
-def : T_PI_pat <S5_vasrhrnd_goodsyntax, int_hexagon_S5_vasrhrnd_goodsyntax>;
-
-def : T_FFF_pat <F2_sffma, int_hexagon_F2_sffma>;
-def : T_FFF_pat <F2_sffms, int_hexagon_F2_sffms>;
-def : T_FFF_pat <F2_sffma_lib, int_hexagon_F2_sffma_lib>;
-def : T_FFF_pat <F2_sffms_lib, int_hexagon_F2_sffms_lib>;
-def : T_FFFQ_pat <F2_sffma_sc, int_hexagon_F2_sffma_sc>;
-
-// Compare floating-point value
-def : T_Q_FF_pat <F2_sfcmpge, int_hexagon_F2_sfcmpge>;
-def : T_Q_FF_pat <F2_sfcmpuo, int_hexagon_F2_sfcmpuo>;
-def : T_Q_FF_pat <F2_sfcmpeq, int_hexagon_F2_sfcmpeq>;
-def : T_Q_FF_pat <F2_sfcmpgt, int_hexagon_F2_sfcmpgt>;
-
-def : T_Q_DD_pat <F2_dfcmpeq, int_hexagon_F2_dfcmpeq>;
-def : T_Q_DD_pat <F2_dfcmpgt, int_hexagon_F2_dfcmpgt>;
-def : T_Q_DD_pat <F2_dfcmpge, int_hexagon_F2_dfcmpge>;
-def : T_Q_DD_pat <F2_dfcmpuo, int_hexagon_F2_dfcmpuo>;
-
-// Create floating-point value
-def : T_I_pat <F2_sfimm_p, int_hexagon_F2_sfimm_p>;
-def : T_I_pat <F2_sfimm_n, int_hexagon_F2_sfimm_n>;
-def : T_I_pat <F2_dfimm_p, int_hexagon_F2_dfimm_p>;
-def : T_I_pat <F2_dfimm_n, int_hexagon_F2_dfimm_n>;
-
-def : T_Q_DI_pat <F2_dfclass, int_hexagon_F2_dfclass>;
-def : T_Q_FI_pat <F2_sfclass, int_hexagon_F2_sfclass>;
-def : T_F_pat <F2_conv_sf2df, int_hexagon_F2_conv_sf2df>;
-def : T_D_pat <F2_conv_df2sf, int_hexagon_F2_conv_df2sf>;
-def : T_R_pat <F2_conv_uw2sf, int_hexagon_F2_conv_uw2sf>;
-def : T_R_pat <F2_conv_uw2df, int_hexagon_F2_conv_uw2df>;
-def : T_R_pat <F2_conv_w2sf, int_hexagon_F2_conv_w2sf>;
-def : T_R_pat <F2_conv_w2df, int_hexagon_F2_conv_w2df>;
-def : T_P_pat <F2_conv_ud2sf, int_hexagon_F2_conv_ud2sf>;
-def : T_P_pat <F2_conv_ud2df, int_hexagon_F2_conv_ud2df>;
-def : T_P_pat <F2_conv_d2sf, int_hexagon_F2_conv_d2sf>;
-def : T_P_pat <F2_conv_d2df, int_hexagon_F2_conv_d2df>;
-def : T_F_pat <F2_conv_sf2uw, int_hexagon_F2_conv_sf2uw>;
-def : T_F_pat <F2_conv_sf2w, int_hexagon_F2_conv_sf2w>;
-def : T_F_pat <F2_conv_sf2ud, int_hexagon_F2_conv_sf2ud>;
-def : T_F_pat <F2_conv_sf2d, int_hexagon_F2_conv_sf2d>;
-def : T_D_pat <F2_conv_df2uw, int_hexagon_F2_conv_df2uw>;
-def : T_D_pat <F2_conv_df2w, int_hexagon_F2_conv_df2w>;
-def : T_D_pat <F2_conv_df2ud, int_hexagon_F2_conv_df2ud>;
-def : T_D_pat <F2_conv_df2d, int_hexagon_F2_conv_df2d>;
-def : T_F_pat <F2_conv_sf2uw_chop, int_hexagon_F2_conv_sf2uw_chop>;
-def : T_F_pat <F2_conv_sf2w_chop, int_hexagon_F2_conv_sf2w_chop>;
-def : T_F_pat <F2_conv_sf2ud_chop, int_hexagon_F2_conv_sf2ud_chop>;
-def : T_F_pat <F2_conv_sf2d_chop, int_hexagon_F2_conv_sf2d_chop>;
-def : T_D_pat <F2_conv_df2uw_chop, int_hexagon_F2_conv_df2uw_chop>;
-def : T_D_pat <F2_conv_df2w_chop, int_hexagon_F2_conv_df2w_chop>;
-def : T_D_pat <F2_conv_df2ud_chop, int_hexagon_F2_conv_df2ud_chop>;
-def : T_D_pat <F2_conv_df2d_chop, int_hexagon_F2_conv_df2d_chop>;
diff --git a/llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td b/llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td
deleted file mode 100644
index 796979e..0000000
--- a/llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td
+++ /dev/null
@@ -1,642 +0,0 @@
-//===- HexagonIntrinsicsV60.td - V60 instruction intrinsics -*- tablegen *-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the Hexagon V60 Compiler Intrinsics in TableGen format.
-//
-//===----------------------------------------------------------------------===//
-
-
-let AddedComplexity = 100 in {
-def : Pat < (v16i32 (int_hexagon_V6_lo (v32i32 HvxWR:$src1))),
- (v16i32 (EXTRACT_SUBREG (v32i32 HvxWR:$src1), vsub_lo)) >;
-
-def : Pat < (v16i32 (int_hexagon_V6_hi (v32i32 HvxWR:$src1))),
- (v16i32 (EXTRACT_SUBREG (v32i32 HvxWR:$src1), vsub_hi)) >;
-
-def : Pat < (v32i32 (int_hexagon_V6_lo_128B (v64i32 HvxWR:$src1))),
- (v32i32 (EXTRACT_SUBREG (v64i32 HvxWR:$src1), vsub_lo)) >;
-
-def : Pat < (v32i32 (int_hexagon_V6_hi_128B (v64i32 HvxWR:$src1))),
- (v32i32 (EXTRACT_SUBREG (v64i32 HvxWR:$src1), vsub_hi)) >;
-}
-
-def : Pat <(v64i1 (bitconvert (v16i32 HvxVR:$src1))),
- (v64i1 (V6_vandvrt(v16i32 HvxVR:$src1), (A2_tfrsi 0x01010101)))>;
-
-def : Pat <(v64i1 (bitconvert (v32i16 HvxVR:$src1))),
- (v64i1 (V6_vandvrt(v32i16 HvxVR:$src1), (A2_tfrsi 0x01010101)))>;
-
-def : Pat <(v64i1 (bitconvert (v64i8 HvxVR:$src1))),
- (v64i1 (V6_vandvrt(v64i8 HvxVR:$src1), (A2_tfrsi 0x01010101)))>;
-
-def : Pat <(v16i32 (bitconvert (v64i1 HvxQR:$src1))),
- (v16i32 (V6_vandqrt(v64i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>;
-
-def : Pat <(v32i16 (bitconvert (v64i1 HvxQR:$src1))),
- (v32i16 (V6_vandqrt(v64i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>;
-
-def : Pat <(v64i8 (bitconvert (v64i1 HvxQR:$src1))),
- (v64i8 (V6_vandqrt(v64i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>;
-
-def : Pat <(v128i1 (bitconvert (v32i32 HvxVR:$src1))),
- (v128i1 (V6_vandvrt (v32i32 HvxVR:$src1), (A2_tfrsi 0x01010101)))>;
-
-def : Pat <(v128i1 (bitconvert (v64i16 HvxVR:$src1))),
- (v128i1 (V6_vandvrt (v64i16 HvxVR:$src1), (A2_tfrsi 0x01010101)))>;
-
-def : Pat <(v128i1 (bitconvert (v128i8 HvxVR:$src1))),
- (v128i1 (V6_vandvrt (v128i8 HvxVR:$src1), (A2_tfrsi 0x01010101)))>;
-
-def : Pat <(v32i32 (bitconvert (v128i1 HvxQR:$src1))),
- (v32i32 (V6_vandqrt (v128i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>;
-
-def : Pat <(v64i16 (bitconvert (v128i1 HvxQR:$src1))),
- (v64i16 (V6_vandqrt (v128i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>;
-
-def : Pat <(v128i8 (bitconvert (v128i1 HvxQR:$src1))),
- (v128i8 (V6_vandqrt (v128i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>;
-
-let AddedComplexity = 140 in {
-def : Pat <(store (v64i1 HvxQR:$src1), (i32 IntRegs:$addr)),
- (V6_vS32b_ai IntRegs:$addr, 0,
- (v16i32 (V6_vandqrt (v64i1 HvxQR:$src1),
- (A2_tfrsi 0x01010101))))>;
-
-def : Pat <(v64i1 (load (i32 IntRegs:$addr))),
- (v64i1 (V6_vandvrt
- (v16i32 (V6_vL32b_ai IntRegs:$addr, 0)), (A2_tfrsi 0x01010101)))>;
-
-def : Pat <(store (v128i1 HvxQR:$src1), (i32 IntRegs:$addr)),
- (V6_vS32b_ai IntRegs:$addr, 0,
- (v32i32 (V6_vandqrt (v128i1 HvxQR:$src1),
- (A2_tfrsi 0x01010101))))>;
-
-def : Pat <(v128i1 (load (i32 IntRegs:$addr))),
- (v128i1 (V6_vandvrt
- (v32i32 (V6_vL32b_ai IntRegs:$addr, 0)), (A2_tfrsi 0x01010101)))>;
-}
-
-multiclass T_R_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID IntRegs:$src1), (MI IntRegs:$src1)>;
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") IntRegs:$src1),
- (MI IntRegs:$src1)>;
-}
-
-multiclass T_V_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxVR:$src1),
- (MI HvxVR:$src1)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1),
- (MI HvxVR:$src1)>;
-}
-
-multiclass T_W_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxWR:$src1),
- (MI HvxWR:$src1)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1),
- (MI HvxWR:$src1)>;
-}
-
-multiclass T_Q_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxQR:$src1),
- (MI HvxQR:$src1)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxQR:$src1),
- (MI HvxQR:$src1)>;
-}
-
-multiclass T_WR_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxWR:$src1, IntRegs:$src2),
- (MI HvxWR:$src1, IntRegs:$src2)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B")HvxWR:$src1, IntRegs:$src2),
- (MI HvxWR:$src1, IntRegs:$src2)>;
-}
-
-multiclass T_VR_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxVR:$src1, IntRegs:$src2),
- (MI HvxVR:$src1, IntRegs:$src2)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B")HvxVR:$src1, IntRegs:$src2),
- (MI HvxVR:$src1, IntRegs:$src2)>;
-}
-
-multiclass T_WV_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxWR:$src1, HvxVR:$src2),
- (MI HvxWR:$src1, HvxVR:$src2)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1, HvxVR:$src2),
- (MI HvxWR:$src1, HvxVR:$src2)>;
-}
-
-multiclass T_WW_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxWR:$src1, HvxWR:$src2),
- (MI HvxWR:$src1, HvxWR:$src2)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1, HvxWR:$src2),
- (MI HvxWR:$src1, HvxWR:$src2)>;
-}
-
-multiclass T_VV_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxVR:$src1, HvxVR:$src2),
- (MI HvxVR:$src1, HvxVR:$src2)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, HvxVR:$src2),
- (MI HvxVR:$src1, HvxVR:$src2)>;
-}
-
-multiclass T_QR_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxQR:$src1, IntRegs:$src2),
- (MI HvxQR:$src1, IntRegs:$src2)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxQR:$src1, IntRegs:$src2),
- (MI HvxQR:$src1, IntRegs:$src2)>;
-}
-
-multiclass T_QQ_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxQR:$src1, HvxQR:$src2),
- (MI HvxQR:$src1, HvxQR:$src2)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxQR:$src1, HvxQR:$src2),
- (MI HvxQR:$src1, HvxQR:$src2)>;
-}
-
-multiclass T_WWR_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
- (MI HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1, HvxWR:$src2,
- IntRegs:$src3),
- (MI HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>;
-}
-
-multiclass T_VVR_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (MI HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, HvxVR:$src2,
- IntRegs:$src3),
- (MI HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>;
-}
-
-multiclass T_WVR_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
- (MI HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1, HvxVR:$src2,
- IntRegs:$src3),
- (MI HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>;
-}
-
-multiclass T_VWR_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxVR:$src1, HvxWR:$src2, IntRegs:$src3),
- (MI HvxVR:$src1, HvxWR:$src2, IntRegs:$src3)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, HvxWR:$src2,
- IntRegs:$src3),
- (MI HvxVR:$src1, HvxWR:$src2, IntRegs:$src3)>;
-}
-
-multiclass T_VVV_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
- (MI HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, HvxVR:$src2,
- HvxVR:$src3),
- (MI HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>;
-}
-
-multiclass T_WVV_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
- (MI HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1, HvxVR:$src2,
- HvxVR:$src3),
- (MI HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>;
-}
-
-multiclass T_QVV_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (MI HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxQR:$src1, HvxVR:$src2,
- HvxVR:$src3),
- (MI HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>;
-}
-
-multiclass T_VQR_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxVR:$src1, HvxQR:$src2, IntRegs:$src3),
- (MI HvxVR:$src1, HvxQR:$src2, IntRegs:$src3)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, HvxQR:$src2,
- IntRegs:$src3),
- (MI HvxVR:$src1, HvxQR:$src2, IntRegs:$src3)>;
-}
-
-
-multiclass T_QVR_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxQR:$src1, HvxVR:$src2, IntRegs:$src3),
- (MI HvxQR:$src1, HvxVR:$src2, IntRegs:$src3)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxQR:$src1, HvxVR:$src2,
- IntRegs:$src3),
- (MI HvxQR:$src1, HvxVR:$src2, IntRegs:$src3)>;
-}
-
-multiclass T_VVI_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxVR:$src1, HvxVR:$src2, imm:$src3),
- (MI HvxVR:$src1, HvxVR:$src2, imm:$src3)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1,
- HvxVR:$src2, imm:$src3),
- (MI HvxVR:$src1, HvxVR:$src2, imm:$src3)>;
-}
-
-multiclass T_WRI_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxWR:$src1, IntRegs:$src2, imm:$src3),
- (MI HvxWR:$src1, IntRegs:$src2, imm:$src3)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1,
- IntRegs:$src2, imm:$src3),
- (MI HvxWR:$src1, IntRegs:$src2, imm:$src3)>;
-}
-
-multiclass T_WWRI_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, imm:$src4),
- (MI HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, imm:$src4)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1, HvxWR:$src2,
- IntRegs:$src3, imm:$src4),
- (MI HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, imm:$src4)>;
-}
-
-multiclass T_VVVR_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegs:$src4),
- (MI HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegs:$src4)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, HvxVR:$src2,
- HvxVR:$src3, IntRegs:$src4),
- (MI HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegs:$src4)>;
-}
-
-multiclass T_WVVR_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegs:$src4),
- (MI HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegs:$src4)>;
-
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1, HvxVR:$src2,
- HvxVR:$src3, IntRegs:$src4),
- (MI HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegs:$src4)>;
-}
-
-defm : T_WR_pat <V6_vtmpyb, int_hexagon_V6_vtmpyb>;
-defm : T_WR_pat <V6_vtmpybus, int_hexagon_V6_vtmpybus>;
-defm : T_VR_pat <V6_vdmpyhb, int_hexagon_V6_vdmpyhb>;
-defm : T_VR_pat <V6_vrmpyub, int_hexagon_V6_vrmpyub>;
-defm : T_VR_pat <V6_vrmpybus, int_hexagon_V6_vrmpybus>;
-defm : T_WR_pat <V6_vdsaduh, int_hexagon_V6_vdsaduh>;
-defm : T_VR_pat <V6_vdmpybus, int_hexagon_V6_vdmpybus>;
-defm : T_WR_pat <V6_vdmpybus_dv, int_hexagon_V6_vdmpybus_dv>;
-defm : T_VR_pat <V6_vdmpyhsusat, int_hexagon_V6_vdmpyhsusat>;
-defm : T_WR_pat <V6_vdmpyhsuisat, int_hexagon_V6_vdmpyhsuisat>;
-defm : T_VR_pat <V6_vdmpyhsat, int_hexagon_V6_vdmpyhsat>;
-defm : T_WR_pat <V6_vdmpyhisat, int_hexagon_V6_vdmpyhisat>;
-defm : T_WR_pat <V6_vdmpyhb_dv, int_hexagon_V6_vdmpyhb_dv>;
-defm : T_VR_pat <V6_vmpybus, int_hexagon_V6_vmpybus>;
-defm : T_WR_pat <V6_vmpabus, int_hexagon_V6_vmpabus>;
-defm : T_WR_pat <V6_vmpahb, int_hexagon_V6_vmpahb>;
-defm : T_VR_pat <V6_vmpyh, int_hexagon_V6_vmpyh>;
-defm : T_VR_pat <V6_vmpyhss, int_hexagon_V6_vmpyhss>;
-defm : T_VR_pat <V6_vmpyhsrs, int_hexagon_V6_vmpyhsrs>;
-defm : T_VR_pat <V6_vmpyuh, int_hexagon_V6_vmpyuh>;
-defm : T_VR_pat <V6_vmpyihb, int_hexagon_V6_vmpyihb>;
-defm : T_VR_pat <V6_vror, int_hexagon_V6_vror>;
-defm : T_VR_pat <V6_vasrw, int_hexagon_V6_vasrw>;
-defm : T_VR_pat <V6_vasrh, int_hexagon_V6_vasrh>;
-defm : T_VR_pat <V6_vaslw, int_hexagon_V6_vaslw>;
-defm : T_VR_pat <V6_vaslh, int_hexagon_V6_vaslh>;
-defm : T_VR_pat <V6_vlsrw, int_hexagon_V6_vlsrw>;
-defm : T_VR_pat <V6_vlsrh, int_hexagon_V6_vlsrh>;
-defm : T_VR_pat <V6_vmpyiwh, int_hexagon_V6_vmpyiwh>;
-defm : T_VR_pat <V6_vmpyiwb, int_hexagon_V6_vmpyiwb>;
-defm : T_WR_pat <V6_vtmpyhb, int_hexagon_V6_vtmpyhb>;
-defm : T_VR_pat <V6_vmpyub, int_hexagon_V6_vmpyub>;
-
-defm : T_VV_pat <V6_vrmpyubv, int_hexagon_V6_vrmpyubv>;
-defm : T_VV_pat <V6_vrmpybv, int_hexagon_V6_vrmpybv>;
-defm : T_VV_pat <V6_vrmpybusv, int_hexagon_V6_vrmpybusv>;
-defm : T_VV_pat <V6_vdmpyhvsat, int_hexagon_V6_vdmpyhvsat>;
-defm : T_VV_pat <V6_vmpybv, int_hexagon_V6_vmpybv>;
-defm : T_VV_pat <V6_vmpyubv, int_hexagon_V6_vmpyubv>;
-defm : T_VV_pat <V6_vmpybusv, int_hexagon_V6_vmpybusv>;
-defm : T_VV_pat <V6_vmpyhv, int_hexagon_V6_vmpyhv>;
-defm : T_VV_pat <V6_vmpyuhv, int_hexagon_V6_vmpyuhv>;
-defm : T_VV_pat <V6_vmpyhvsrs, int_hexagon_V6_vmpyhvsrs>;
-defm : T_VV_pat <V6_vmpyhus, int_hexagon_V6_vmpyhus>;
-defm : T_WW_pat <V6_vmpabusv, int_hexagon_V6_vmpabusv>;
-defm : T_VV_pat <V6_vmpyih, int_hexagon_V6_vmpyih>;
-defm : T_VV_pat <V6_vand, int_hexagon_V6_vand>;
-defm : T_VV_pat <V6_vor, int_hexagon_V6_vor>;
-defm : T_VV_pat <V6_vxor, int_hexagon_V6_vxor>;
-defm : T_VV_pat <V6_vaddw, int_hexagon_V6_vaddw>;
-defm : T_VV_pat <V6_vaddubsat, int_hexagon_V6_vaddubsat>;
-defm : T_VV_pat <V6_vadduhsat, int_hexagon_V6_vadduhsat>;
-defm : T_VV_pat <V6_vaddhsat, int_hexagon_V6_vaddhsat>;
-defm : T_VV_pat <V6_vaddwsat, int_hexagon_V6_vaddwsat>;
-defm : T_VV_pat <V6_vsubb, int_hexagon_V6_vsubb>;
-defm : T_VV_pat <V6_vsubh, int_hexagon_V6_vsubh>;
-defm : T_VV_pat <V6_vsubw, int_hexagon_V6_vsubw>;
-defm : T_VV_pat <V6_vsububsat, int_hexagon_V6_vsububsat>;
-defm : T_VV_pat <V6_vsubuhsat, int_hexagon_V6_vsubuhsat>;
-defm : T_VV_pat <V6_vsubhsat, int_hexagon_V6_vsubhsat>;
-defm : T_VV_pat <V6_vsubwsat, int_hexagon_V6_vsubwsat>;
-defm : T_WW_pat <V6_vaddb_dv, int_hexagon_V6_vaddb_dv>;
-defm : T_WW_pat <V6_vaddh_dv, int_hexagon_V6_vaddh_dv>;
-defm : T_WW_pat <V6_vaddw_dv, int_hexagon_V6_vaddw_dv>;
-defm : T_WW_pat <V6_vaddubsat_dv, int_hexagon_V6_vaddubsat_dv>;
-defm : T_WW_pat <V6_vadduhsat_dv, int_hexagon_V6_vadduhsat_dv>;
-defm : T_WW_pat <V6_vaddhsat_dv, int_hexagon_V6_vaddhsat_dv>;
-defm : T_WW_pat <V6_vaddwsat_dv, int_hexagon_V6_vaddwsat_dv>;
-defm : T_WW_pat <V6_vsubb_dv, int_hexagon_V6_vsubb_dv>;
-defm : T_WW_pat <V6_vsubh_dv, int_hexagon_V6_vsubh_dv>;
-defm : T_WW_pat <V6_vsubw_dv, int_hexagon_V6_vsubw_dv>;
-defm : T_WW_pat <V6_vsububsat_dv, int_hexagon_V6_vsububsat_dv>;
-defm : T_WW_pat <V6_vsubuhsat_dv, int_hexagon_V6_vsubuhsat_dv>;
-defm : T_WW_pat <V6_vsubhsat_dv, int_hexagon_V6_vsubhsat_dv>;
-defm : T_WW_pat <V6_vsubwsat_dv, int_hexagon_V6_vsubwsat_dv>;
-defm : T_VV_pat <V6_vaddubh, int_hexagon_V6_vaddubh>;
-defm : T_VV_pat <V6_vadduhw, int_hexagon_V6_vadduhw>;
-defm : T_VV_pat <V6_vaddhw, int_hexagon_V6_vaddhw>;
-defm : T_VV_pat <V6_vsububh, int_hexagon_V6_vsububh>;
-defm : T_VV_pat <V6_vsubuhw, int_hexagon_V6_vsubuhw>;
-defm : T_VV_pat <V6_vsubhw, int_hexagon_V6_vsubhw>;
-defm : T_VV_pat <V6_vabsdiffub, int_hexagon_V6_vabsdiffub>;
-defm : T_VV_pat <V6_vabsdiffh, int_hexagon_V6_vabsdiffh>;
-defm : T_VV_pat <V6_vabsdiffuh, int_hexagon_V6_vabsdiffuh>;
-defm : T_VV_pat <V6_vabsdiffw, int_hexagon_V6_vabsdiffw>;
-defm : T_VV_pat <V6_vavgub, int_hexagon_V6_vavgub>;
-defm : T_VV_pat <V6_vavguh, int_hexagon_V6_vavguh>;
-defm : T_VV_pat <V6_vavgh, int_hexagon_V6_vavgh>;
-defm : T_VV_pat <V6_vavgw, int_hexagon_V6_vavgw>;
-defm : T_VV_pat <V6_vnavgub, int_hexagon_V6_vnavgub>;
-defm : T_VV_pat <V6_vnavgh, int_hexagon_V6_vnavgh>;
-defm : T_VV_pat <V6_vnavgw, int_hexagon_V6_vnavgw>;
-defm : T_VV_pat <V6_vavgubrnd, int_hexagon_V6_vavgubrnd>;
-defm : T_VV_pat <V6_vavguhrnd, int_hexagon_V6_vavguhrnd>;
-defm : T_VV_pat <V6_vavghrnd, int_hexagon_V6_vavghrnd>;
-defm : T_VV_pat <V6_vavgwrnd, int_hexagon_V6_vavgwrnd>;
-defm : T_WW_pat <V6_vmpabuuv, int_hexagon_V6_vmpabuuv>;
-
-defm : T_VVR_pat <V6_vdmpyhb_acc, int_hexagon_V6_vdmpyhb_acc>;
-defm : T_VVR_pat <V6_vrmpyub_acc, int_hexagon_V6_vrmpyub_acc>;
-defm : T_VVR_pat <V6_vrmpybus_acc, int_hexagon_V6_vrmpybus_acc>;
-defm : T_VVR_pat <V6_vdmpybus_acc, int_hexagon_V6_vdmpybus_acc>;
-defm : T_VVR_pat <V6_vdmpyhsusat_acc, int_hexagon_V6_vdmpyhsusat_acc>;
-defm : T_VVR_pat <V6_vdmpyhsat_acc, int_hexagon_V6_vdmpyhsat_acc>;
-defm : T_VVR_pat <V6_vmpyiwb_acc, int_hexagon_V6_vmpyiwb_acc>;
-defm : T_VVR_pat <V6_vmpyiwh_acc, int_hexagon_V6_vmpyiwh_acc>;
-defm : T_VVR_pat <V6_vmpyihb_acc, int_hexagon_V6_vmpyihb_acc>;
-defm : T_VVR_pat <V6_vaslw_acc, int_hexagon_V6_vaslw_acc>;
-defm : T_VVR_pat <V6_vasrw_acc, int_hexagon_V6_vasrw_acc>;
-
-defm : T_VWR_pat <V6_vdmpyhsuisat_acc, int_hexagon_V6_vdmpyhsuisat_acc>;
-defm : T_VWR_pat <V6_vdmpyhisat_acc, int_hexagon_V6_vdmpyhisat_acc>;
-
-defm : T_WVR_pat <V6_vmpybus_acc, int_hexagon_V6_vmpybus_acc>;
-defm : T_WVR_pat <V6_vmpyhsat_acc, int_hexagon_V6_vmpyhsat_acc>;
-defm : T_WVR_pat <V6_vmpyuh_acc, int_hexagon_V6_vmpyuh_acc>;
-defm : T_WVR_pat <V6_vmpyub_acc, int_hexagon_V6_vmpyub_acc>;
-
-defm : T_WWR_pat <V6_vtmpyb_acc, int_hexagon_V6_vtmpyb_acc>;
-defm : T_WWR_pat <V6_vtmpybus_acc, int_hexagon_V6_vtmpybus_acc>;
-defm : T_WWR_pat <V6_vtmpyhb_acc, int_hexagon_V6_vtmpyhb_acc>;
-defm : T_WWR_pat <V6_vdmpybus_dv_acc, int_hexagon_V6_vdmpybus_dv_acc>;
-defm : T_WWR_pat <V6_vdmpyhb_dv_acc, int_hexagon_V6_vdmpyhb_dv_acc>;
-defm : T_WWR_pat <V6_vmpabus_acc, int_hexagon_V6_vmpabus_acc>;
-defm : T_WWR_pat <V6_vmpahb_acc, int_hexagon_V6_vmpahb_acc>;
-defm : T_WWR_pat <V6_vdsaduh_acc, int_hexagon_V6_vdsaduh_acc>;
-
-defm : T_VVV_pat <V6_vdmpyhvsat_acc, int_hexagon_V6_vdmpyhvsat_acc>;
-defm : T_WVV_pat <V6_vmpybusv_acc, int_hexagon_V6_vmpybusv_acc>;
-defm : T_WVV_pat <V6_vmpybv_acc, int_hexagon_V6_vmpybv_acc>;
-defm : T_WVV_pat <V6_vmpyhus_acc, int_hexagon_V6_vmpyhus_acc>;
-defm : T_WVV_pat <V6_vmpyhv_acc, int_hexagon_V6_vmpyhv_acc>;
-defm : T_VVV_pat <V6_vmpyiewh_acc, int_hexagon_V6_vmpyiewh_acc>;
-defm : T_VVV_pat <V6_vmpyiewuh_acc, int_hexagon_V6_vmpyiewuh_acc>;
-defm : T_VVV_pat <V6_vmpyih_acc, int_hexagon_V6_vmpyih_acc>;
-defm : T_VVV_pat <V6_vmpyowh_rnd_sacc, int_hexagon_V6_vmpyowh_rnd_sacc>;
-defm : T_VVV_pat <V6_vmpyowh_sacc, int_hexagon_V6_vmpyowh_sacc>;
-defm : T_WVV_pat <V6_vmpyubv_acc, int_hexagon_V6_vmpyubv_acc>;
-defm : T_WVV_pat <V6_vmpyuhv_acc, int_hexagon_V6_vmpyuhv_acc>;
-defm : T_VVV_pat <V6_vrmpybusv_acc, int_hexagon_V6_vrmpybusv_acc>;
-defm : T_VVV_pat <V6_vrmpybv_acc, int_hexagon_V6_vrmpybv_acc>;
-defm : T_VVV_pat <V6_vrmpyubv_acc, int_hexagon_V6_vrmpyubv_acc>;
-
-// Compare instructions
-defm : T_QVV_pat <V6_veqb_and, int_hexagon_V6_veqb_and>;
-defm : T_QVV_pat <V6_veqh_and, int_hexagon_V6_veqh_and>;
-defm : T_QVV_pat <V6_veqw_and, int_hexagon_V6_veqw_and>;
-defm : T_QVV_pat <V6_vgtb_and, int_hexagon_V6_vgtb_and>;
-defm : T_QVV_pat <V6_vgth_and, int_hexagon_V6_vgth_and>;
-defm : T_QVV_pat <V6_vgtw_and, int_hexagon_V6_vgtw_and>;
-defm : T_QVV_pat <V6_vgtub_and, int_hexagon_V6_vgtub_and>;
-defm : T_QVV_pat <V6_vgtuh_and, int_hexagon_V6_vgtuh_and>;
-defm : T_QVV_pat <V6_vgtuw_and, int_hexagon_V6_vgtuw_and>;
-defm : T_QVV_pat <V6_veqb_or, int_hexagon_V6_veqb_or>;
-defm : T_QVV_pat <V6_veqh_or, int_hexagon_V6_veqh_or>;
-defm : T_QVV_pat <V6_veqw_or, int_hexagon_V6_veqw_or>;
-defm : T_QVV_pat <V6_vgtb_or, int_hexagon_V6_vgtb_or>;
-defm : T_QVV_pat <V6_vgth_or, int_hexagon_V6_vgth_or>;
-defm : T_QVV_pat <V6_vgtw_or, int_hexagon_V6_vgtw_or>;
-defm : T_QVV_pat <V6_vgtub_or, int_hexagon_V6_vgtub_or>;
-defm : T_QVV_pat <V6_vgtuh_or, int_hexagon_V6_vgtuh_or>;
-defm : T_QVV_pat <V6_vgtuw_or, int_hexagon_V6_vgtuw_or>;
-defm : T_QVV_pat <V6_veqb_xor, int_hexagon_V6_veqb_xor>;
-defm : T_QVV_pat <V6_veqh_xor, int_hexagon_V6_veqh_xor>;
-defm : T_QVV_pat <V6_veqw_xor, int_hexagon_V6_veqw_xor>;
-defm : T_QVV_pat <V6_vgtb_xor, int_hexagon_V6_vgtb_xor>;
-defm : T_QVV_pat <V6_vgth_xor, int_hexagon_V6_vgth_xor>;
-defm : T_QVV_pat <V6_vgtw_xor, int_hexagon_V6_vgtw_xor>;
-defm : T_QVV_pat <V6_vgtub_xor, int_hexagon_V6_vgtub_xor>;
-defm : T_QVV_pat <V6_vgtuh_xor, int_hexagon_V6_vgtuh_xor>;
-defm : T_QVV_pat <V6_vgtuw_xor, int_hexagon_V6_vgtuw_xor>;
-
-defm : T_VV_pat <V6_vminub, int_hexagon_V6_vminub>;
-defm : T_VV_pat <V6_vminuh, int_hexagon_V6_vminuh>;
-defm : T_VV_pat <V6_vminh, int_hexagon_V6_vminh>;
-defm : T_VV_pat <V6_vminw, int_hexagon_V6_vminw>;
-defm : T_VV_pat <V6_vmaxub, int_hexagon_V6_vmaxub>;
-defm : T_VV_pat <V6_vmaxuh, int_hexagon_V6_vmaxuh>;
-defm : T_VV_pat <V6_vmaxh, int_hexagon_V6_vmaxh>;
-defm : T_VV_pat <V6_vmaxw, int_hexagon_V6_vmaxw>;
-defm : T_VV_pat <V6_vdelta, int_hexagon_V6_vdelta>;
-defm : T_VV_pat <V6_vrdelta, int_hexagon_V6_vrdelta>;
-defm : T_VV_pat <V6_vdealb4w, int_hexagon_V6_vdealb4w>;
-defm : T_VV_pat <V6_vmpyowh_rnd, int_hexagon_V6_vmpyowh_rnd>;
-defm : T_VV_pat <V6_vshuffeb, int_hexagon_V6_vshuffeb>;
-defm : T_VV_pat <V6_vshuffob, int_hexagon_V6_vshuffob>;
-defm : T_VV_pat <V6_vshufeh, int_hexagon_V6_vshufeh>;
-defm : T_VV_pat <V6_vshufoh, int_hexagon_V6_vshufoh>;
-defm : T_VV_pat <V6_vshufoeh, int_hexagon_V6_vshufoeh>;
-defm : T_VV_pat <V6_vshufoeb, int_hexagon_V6_vshufoeb>;
-defm : T_VV_pat <V6_vcombine, int_hexagon_V6_vcombine>;
-defm : T_VV_pat <V6_vmpyieoh, int_hexagon_V6_vmpyieoh>;
-defm : T_VV_pat <V6_vsathub, int_hexagon_V6_vsathub>;
-defm : T_VV_pat <V6_vsatwh, int_hexagon_V6_vsatwh>;
-defm : T_VV_pat <V6_vroundwh, int_hexagon_V6_vroundwh>;
-defm : T_VV_pat <V6_vroundwuh, int_hexagon_V6_vroundwuh>;
-defm : T_VV_pat <V6_vroundhb, int_hexagon_V6_vroundhb>;
-defm : T_VV_pat <V6_vroundhub, int_hexagon_V6_vroundhub>;
-defm : T_VV_pat <V6_vasrwv, int_hexagon_V6_vasrwv>;
-defm : T_VV_pat <V6_vlsrwv, int_hexagon_V6_vlsrwv>;
-defm : T_VV_pat <V6_vlsrhv, int_hexagon_V6_vlsrhv>;
-defm : T_VV_pat <V6_vasrhv, int_hexagon_V6_vasrhv>;
-defm : T_VV_pat <V6_vaslwv, int_hexagon_V6_vaslwv>;
-defm : T_VV_pat <V6_vaslhv, int_hexagon_V6_vaslhv>;
-defm : T_VV_pat <V6_vaddb, int_hexagon_V6_vaddb>;
-defm : T_VV_pat <V6_vaddh, int_hexagon_V6_vaddh>;
-defm : T_VV_pat <V6_vmpyiewuh, int_hexagon_V6_vmpyiewuh>;
-defm : T_VV_pat <V6_vmpyiowh, int_hexagon_V6_vmpyiowh>;
-defm : T_VV_pat <V6_vpackeb, int_hexagon_V6_vpackeb>;
-defm : T_VV_pat <V6_vpackeh, int_hexagon_V6_vpackeh>;
-defm : T_VV_pat <V6_vpackhub_sat, int_hexagon_V6_vpackhub_sat>;
-defm : T_VV_pat <V6_vpackhb_sat, int_hexagon_V6_vpackhb_sat>;
-defm : T_VV_pat <V6_vpackwuh_sat, int_hexagon_V6_vpackwuh_sat>;
-defm : T_VV_pat <V6_vpackwh_sat, int_hexagon_V6_vpackwh_sat>;
-defm : T_VV_pat <V6_vpackob, int_hexagon_V6_vpackob>;
-defm : T_VV_pat <V6_vpackoh, int_hexagon_V6_vpackoh>;
-defm : T_VV_pat <V6_vmpyewuh, int_hexagon_V6_vmpyewuh>;
-defm : T_VV_pat <V6_vmpyowh, int_hexagon_V6_vmpyowh>;
-
-defm : T_QVV_pat <V6_vaddbq, int_hexagon_V6_vaddbq>;
-defm : T_QVV_pat <V6_vaddhq, int_hexagon_V6_vaddhq>;
-defm : T_QVV_pat <V6_vaddwq, int_hexagon_V6_vaddwq>;
-defm : T_QVV_pat <V6_vaddbnq, int_hexagon_V6_vaddbnq>;
-defm : T_QVV_pat <V6_vaddhnq, int_hexagon_V6_vaddhnq>;
-defm : T_QVV_pat <V6_vaddwnq, int_hexagon_V6_vaddwnq>;
-defm : T_QVV_pat <V6_vsubbq, int_hexagon_V6_vsubbq>;
-defm : T_QVV_pat <V6_vsubhq, int_hexagon_V6_vsubhq>;
-defm : T_QVV_pat <V6_vsubwq, int_hexagon_V6_vsubwq>;
-defm : T_QVV_pat <V6_vsubbnq, int_hexagon_V6_vsubbnq>;
-defm : T_QVV_pat <V6_vsubhnq, int_hexagon_V6_vsubhnq>;
-defm : T_QVV_pat <V6_vsubwnq, int_hexagon_V6_vsubwnq>;
-
-defm : T_V_pat <V6_vabsh, int_hexagon_V6_vabsh>;
-defm : T_V_pat <V6_vabsw, int_hexagon_V6_vabsw>;
-defm : T_V_pat <V6_vabsw_sat, int_hexagon_V6_vabsw_sat>;
-defm : T_V_pat <V6_vabsh_sat, int_hexagon_V6_vabsh_sat>;
-defm : T_V_pat <V6_vnot, int_hexagon_V6_vnot>;
-defm : T_V_pat <V6_vassign, int_hexagon_V6_vassign>;
-defm : T_V_pat <V6_vzb, int_hexagon_V6_vzb>;
-defm : T_V_pat <V6_vzh, int_hexagon_V6_vzh>;
-defm : T_V_pat <V6_vsb, int_hexagon_V6_vsb>;
-defm : T_V_pat <V6_vsh, int_hexagon_V6_vsh>;
-defm : T_V_pat <V6_vdealh, int_hexagon_V6_vdealh>;
-defm : T_V_pat <V6_vdealb, int_hexagon_V6_vdealb>;
-defm : T_V_pat <V6_vunpackub, int_hexagon_V6_vunpackub>;
-defm : T_V_pat <V6_vunpackuh, int_hexagon_V6_vunpackuh>;
-defm : T_V_pat <V6_vunpackb, int_hexagon_V6_vunpackb>;
-defm : T_V_pat <V6_vunpackh, int_hexagon_V6_vunpackh>;
-defm : T_V_pat <V6_vshuffh, int_hexagon_V6_vshuffh>;
-defm : T_V_pat <V6_vshuffb, int_hexagon_V6_vshuffb>;
-defm : T_V_pat <V6_vcl0w, int_hexagon_V6_vcl0w>;
-defm : T_V_pat <V6_vpopcounth, int_hexagon_V6_vpopcounth>;
-defm : T_V_pat <V6_vcl0h, int_hexagon_V6_vcl0h>;
-defm : T_V_pat <V6_vnormamtw, int_hexagon_V6_vnormamtw>;
-defm : T_V_pat <V6_vnormamth, int_hexagon_V6_vnormamth>;
-
-defm : T_W_pat <V6_lo, int_hexagon_V6_lo>;
-defm : T_W_pat <V6_hi, int_hexagon_V6_hi>;
-defm : T_W_pat <V6_vassignp, int_hexagon_V6_vassignp>;
-
-defm : T_WRI_pat <V6_vrmpybusi, int_hexagon_V6_vrmpybusi>;
-defm : T_WRI_pat <V6_vrsadubi, int_hexagon_V6_vrsadubi>;
-defm : T_WRI_pat <V6_vrmpyubi, int_hexagon_V6_vrmpyubi>;
-
-defm : T_WWRI_pat <V6_vrmpybusi_acc, int_hexagon_V6_vrmpybusi_acc>;
-defm : T_WWRI_pat <V6_vrsadubi_acc, int_hexagon_V6_vrsadubi_acc>;
-defm : T_WWRI_pat <V6_vrmpyubi_acc, int_hexagon_V6_vrmpyubi_acc>;
-
-// Assembler mapped.
-//defm : T_V_pat <V6_vtran2x2, int_hexagon_V6_vtran2x2>;
-// Intrinsic not present earlier; needs to be added.
-defm : T_VVR_pat <V6_valignb, int_hexagon_V6_valignb>;
-defm : T_VVR_pat <V6_vlalignb, int_hexagon_V6_vlalignb>;
-defm : T_VVR_pat <V6_vasrwh, int_hexagon_V6_vasrwh>;
-defm : T_VVR_pat <V6_vasrwhsat, int_hexagon_V6_vasrwhsat>;
-defm : T_VVR_pat <V6_vasrwhrndsat, int_hexagon_V6_vasrwhrndsat>;
-defm : T_VVR_pat <V6_vasrwuhsat, int_hexagon_V6_vasrwuhsat>;
-defm : T_VVR_pat <V6_vasrhubsat, int_hexagon_V6_vasrhubsat>;
-defm : T_VVR_pat <V6_vasrhubrndsat, int_hexagon_V6_vasrhubrndsat>;
-defm : T_VVR_pat <V6_vasrhbrndsat, int_hexagon_V6_vasrhbrndsat>;
-
-defm : T_VVR_pat <V6_vshuffvdd, int_hexagon_V6_vshuffvdd>;
-defm : T_VVR_pat <V6_vdealvdd, int_hexagon_V6_vdealvdd>;
-
-defm : T_WV_pat <V6_vunpackob, int_hexagon_V6_vunpackob>;
-defm : T_WV_pat <V6_vunpackoh, int_hexagon_V6_vunpackoh>;
-defm : T_VVI_pat <V6_valignbi, int_hexagon_V6_valignbi>;
-defm : T_VVI_pat <V6_vlalignbi, int_hexagon_V6_vlalignbi>;
-
-defm : T_QVV_pat <V6_vswap, int_hexagon_V6_vswap>;
-defm : T_QVV_pat <V6_vmux, int_hexagon_V6_vmux>;
-defm : T_QQ_pat <V6_pred_and, int_hexagon_V6_pred_and>;
-defm : T_QQ_pat <V6_pred_or, int_hexagon_V6_pred_or>;
-defm : T_Q_pat <V6_pred_not, int_hexagon_V6_pred_not>;
-defm : T_QQ_pat <V6_pred_xor, int_hexagon_V6_pred_xor>;
-defm : T_QQ_pat <V6_pred_or_n, int_hexagon_V6_pred_or_n>;
-defm : T_QQ_pat <V6_pred_and_n, int_hexagon_V6_pred_and_n>;
-defm : T_VV_pat <V6_veqb, int_hexagon_V6_veqb>;
-defm : T_VV_pat <V6_veqh, int_hexagon_V6_veqh>;
-defm : T_VV_pat <V6_veqw, int_hexagon_V6_veqw>;
-defm : T_VV_pat <V6_vgtb, int_hexagon_V6_vgtb>;
-defm : T_VV_pat <V6_vgth, int_hexagon_V6_vgth>;
-defm : T_VV_pat <V6_vgtw, int_hexagon_V6_vgtw>;
-defm : T_VV_pat <V6_vgtub, int_hexagon_V6_vgtub>;
-defm : T_VV_pat <V6_vgtuh, int_hexagon_V6_vgtuh>;
-defm : T_VV_pat <V6_vgtuw, int_hexagon_V6_vgtuw>;
-
-defm : T_VQR_pat <V6_vandqrt_acc, int_hexagon_V6_vandqrt_acc>;
-defm : T_QVR_pat <V6_vandvrt_acc, int_hexagon_V6_vandvrt_acc>;
-defm : T_QR_pat <V6_vandqrt, int_hexagon_V6_vandqrt>;
-defm : T_R_pat <V6_lvsplatw, int_hexagon_V6_lvsplatw>;
-defm : T_R_pat <V6_pred_scalar2, int_hexagon_V6_pred_scalar2>;
-defm : T_VR_pat <V6_vandvrt, int_hexagon_V6_vandvrt>;
-
-defm : T_VVR_pat <V6_vlutvvb, int_hexagon_V6_vlutvvb>;
-defm : T_VVR_pat <V6_vlutvwh, int_hexagon_V6_vlutvwh>;
-defm : T_VVVR_pat <V6_vlutvvb_oracc, int_hexagon_V6_vlutvvb_oracc>;
-defm : T_WVVR_pat <V6_vlutvwh_oracc, int_hexagon_V6_vlutvwh_oracc>;
-
-defm : T_QVR_pat <V6_vandvrt_acc, int_hexagon_V6_vandvrt_acc>;
-def : T_PI_pat <S6_rol_i_p, int_hexagon_S6_rol_i_p>;
-def : T_RI_pat <S6_rol_i_r, int_hexagon_S6_rol_i_r>;
-def : T_PPI_pat <S6_rol_i_p_nac, int_hexagon_S6_rol_i_p_nac>;
-def : T_PPI_pat <S6_rol_i_p_acc, int_hexagon_S6_rol_i_p_acc>;
-def : T_PPI_pat <S6_rol_i_p_and, int_hexagon_S6_rol_i_p_and>;
-def : T_PPI_pat <S6_rol_i_p_or, int_hexagon_S6_rol_i_p_or>;
-def : T_PPI_pat <S6_rol_i_p_xacc, int_hexagon_S6_rol_i_p_xacc>;
-def : T_RRI_pat <S6_rol_i_r_nac, int_hexagon_S6_rol_i_r_nac>;
-def : T_RRI_pat <S6_rol_i_r_acc, int_hexagon_S6_rol_i_r_acc>;
-def : T_RRI_pat <S6_rol_i_r_and, int_hexagon_S6_rol_i_r_and>;
-def : T_RRI_pat <S6_rol_i_r_or, int_hexagon_S6_rol_i_r_or>;
-def : T_RRI_pat <S6_rol_i_r_xacc, int_hexagon_S6_rol_i_r_xacc>;
-
-defm : T_VR_pat <V6_extractw, int_hexagon_V6_extractw>;
-defm : T_VR_pat <V6_vinsertwr, int_hexagon_V6_vinsertwr>;
-
-//def : T_PPQ_pat <S2_cabacencbin, int_hexagon_S2_cabacencbin>;
-
-def: Pat<(v64i16 (trunc v64i32:$Vdd)),
- (v64i16 (V6_vpackwh_sat
- (v32i32 (V6_hi HvxWR:$Vdd)),
- (v32i32 (V6_lo HvxWR:$Vdd))))>;
-
-def: Pat<(int_hexagon_V6_vd0), (V6_vd0)>;
-def: Pat<(int_hexagon_V6_vd0_128B), (V6_vd0)>;
-
diff --git a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
index c2eb24b..c34eecd 100644
--- a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
@@ -38,7 +38,6 @@
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsHexagon.h"
#include "llvm/IR/Module.h"
diff --git a/llvm/lib/Target/Hexagon/HexagonMapAsm2IntrinV62.gen.td b/llvm/lib/Target/Hexagon/HexagonMapAsm2IntrinV62.gen.td
deleted file mode 100644
index 2fcefe6..0000000
--- a/llvm/lib/Target/Hexagon/HexagonMapAsm2IntrinV62.gen.td
+++ /dev/null
@@ -1,179 +0,0 @@
-//===--- HexagonMapAsm2IntrinV62.gen.td -----------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-multiclass T_VR_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxVR:$src1, IntRegs:$src2),
- (MI HvxVR:$src1, IntRegs:$src2)>;
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, IntRegs:$src2),
- (MI HvxVR:$src1, IntRegs:$src2)>;
-}
-
-multiclass T_VVL_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (MI HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>;
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, HvxVR:$src2,
- IntRegsLow8:$src3),
- (MI HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>;
-}
-
-multiclass T_VV_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxVR:$src1, HvxVR:$src2),
- (MI HvxVR:$src1, HvxVR:$src2)>;
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, HvxVR:$src2),
- (MI HvxVR:$src1, HvxVR:$src2)>;
-}
-
-multiclass T_WW_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxWR:$src1, HvxWR:$src2),
- (MI HvxWR:$src1, HvxWR:$src2)>;
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1, HvxWR:$src2),
- (MI HvxWR:$src1, HvxWR:$src2)>;
-}
-
-multiclass T_WVV_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
- (MI HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>;
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1, HvxVR:$src2,
- HvxVR:$src3),
- (MI HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>;
-}
-
-multiclass T_WR_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxWR:$src1, IntRegs:$src2),
- (MI HvxWR:$src1, IntRegs:$src2)>;
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1, IntRegs:$src2),
- (MI HvxWR:$src1, IntRegs:$src2)>;
-}
-
-multiclass T_WWR_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
- (MI HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>;
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1, HvxWR:$src2,
- IntRegs:$src3),
- (MI HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>;
-}
-
-multiclass T_VVR_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (MI HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>;
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, HvxVR:$src2,
- IntRegs:$src3),
- (MI HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>;
-}
-
-multiclass T_ZR_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxQR:$src1, IntRegs:$src2),
- (MI HvxQR:$src1, IntRegs:$src2)>;
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxQR:$src1, IntRegs:$src2),
- (MI HvxQR:$src1, IntRegs:$src2)>;
-}
-
-multiclass T_VZR_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxVR:$src1, HvxQR:$src2, IntRegs:$src3),
- (MI HvxVR:$src1, HvxQR:$src2, IntRegs:$src3)>;
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, HvxQR:$src2,
- IntRegs:$src3),
- (MI HvxVR:$src1, HvxQR:$src2, IntRegs:$src3)>;
-}
-
-multiclass T_ZV_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxQR:$src1, HvxVR:$src2),
- (MI HvxQR:$src1, HvxVR:$src2)>;
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxQR:$src1, HvxVR:$src2),
- (MI HvxQR:$src1, HvxVR:$src2)>;
-}
-
-multiclass T_R_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID IntRegs:$src1),
- (MI IntRegs:$src1)>;
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") IntRegs:$src1),
- (MI IntRegs:$src1)>;
-}
-
-multiclass T_ZZ_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxQR:$src1, HvxQR:$src2),
- (MI HvxQR:$src1, HvxQR:$src2)>;
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxQR:$src1, HvxQR:$src2),
- (MI HvxQR:$src1, HvxQR:$src2)>;
-}
-
-multiclass T_VVI_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxVR:$src1, HvxVR:$src2, imm:$src3),
- (MI HvxVR:$src1, HvxVR:$src2, imm:$src3)>;
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, HvxVR:$src2,
- imm:$src3),
- (MI HvxVR:$src1, HvxVR:$src2, imm:$src3)>;
-}
-
-multiclass T_VVVI_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, imm:$src4),
- (MI HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, imm:$src4)>;
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxVR:$src1, HvxVR:$src2,
- HvxVR:$src3, imm:$src4),
- (MI HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, imm:$src4)>;
-}
-
-multiclass T_WVVI_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
- def: Pat<(IntID HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, imm:$src4),
- (MI HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, imm:$src4)>;
- def: Pat<(!cast<Intrinsic>(IntID#"_128B") HvxWR:$src1, HvxVR:$src2,
- HvxVR:$src3, imm:$src4),
- (MI HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, imm:$src4)>;
-}
-
-def : T_R_pat <S6_vsplatrbp, int_hexagon_S6_vsplatrbp>;
-def : T_PP_pat <M6_vabsdiffb, int_hexagon_M6_vabsdiffb>;
-def : T_PP_pat <M6_vabsdiffub, int_hexagon_M6_vabsdiffub>;
-def : T_PP_pat <S6_vtrunehb_ppp, int_hexagon_S6_vtrunehb_ppp>;
-def : T_PP_pat <S6_vtrunohb_ppp, int_hexagon_S6_vtrunohb_ppp>;
-
-defm : T_VR_HVX_gen_pat <V6_vlsrb, int_hexagon_V6_vlsrb>;
-defm : T_VR_HVX_gen_pat <V6_vmpyiwub, int_hexagon_V6_vmpyiwub>;
-defm : T_VVL_HVX_gen_pat <V6_vasrwuhrndsat, int_hexagon_V6_vasrwuhrndsat>;
-defm : T_VVL_HVX_gen_pat <V6_vasruwuhrndsat, int_hexagon_V6_vasruwuhrndsat>;
-defm : T_VVL_HVX_gen_pat <V6_vasrhbsat, int_hexagon_V6_vasrhbsat>;
-defm : T_VVL_HVX_gen_pat <V6_vlutvvb_nm, int_hexagon_V6_vlutvvb_nm>;
-defm : T_VVL_HVX_gen_pat <V6_vlutvwh_nm, int_hexagon_V6_vlutvwh_nm>;
-defm : T_VV_HVX_gen_pat <V6_vrounduwuh, int_hexagon_V6_vrounduwuh>;
-defm : T_VV_HVX_gen_pat <V6_vrounduhub, int_hexagon_V6_vrounduhub>;
-defm : T_VV_HVX_gen_pat <V6_vadduwsat, int_hexagon_V6_vadduwsat>;
-defm : T_VV_HVX_gen_pat <V6_vsubuwsat, int_hexagon_V6_vsubuwsat>;
-defm : T_VV_HVX_gen_pat <V6_vaddbsat, int_hexagon_V6_vaddbsat>;
-defm : T_VV_HVX_gen_pat <V6_vsubbsat, int_hexagon_V6_vsubbsat>;
-defm : T_VV_HVX_gen_pat <V6_vaddububb_sat, int_hexagon_V6_vaddububb_sat>;
-defm : T_VV_HVX_gen_pat <V6_vsubububb_sat, int_hexagon_V6_vsubububb_sat>;
-defm : T_VV_HVX_gen_pat <V6_vmpyewuh_64, int_hexagon_V6_vmpyewuh_64>;
-defm : T_VV_HVX_gen_pat <V6_vmaxb, int_hexagon_V6_vmaxb>;
-defm : T_VV_HVX_gen_pat <V6_vminb, int_hexagon_V6_vminb>;
-defm : T_VV_HVX_gen_pat <V6_vsatuwuh, int_hexagon_V6_vsatuwuh>;
-defm : T_VV_HVX_gen_pat <V6_vaddclbw, int_hexagon_V6_vaddclbw>;
-defm : T_VV_HVX_gen_pat <V6_vaddclbh, int_hexagon_V6_vaddclbh>;
-defm : T_WW_HVX_gen_pat <V6_vadduwsat_dv, int_hexagon_V6_vadduwsat_dv>;
-defm : T_WW_HVX_gen_pat <V6_vsubuwsat_dv, int_hexagon_V6_vsubuwsat_dv>;
-defm : T_WW_HVX_gen_pat <V6_vaddbsat_dv, int_hexagon_V6_vaddbsat_dv>;
-defm : T_WW_HVX_gen_pat <V6_vsubbsat_dv, int_hexagon_V6_vsubbsat_dv>;
-defm : T_WVV_HVX_gen_pat <V6_vaddhw_acc, int_hexagon_V6_vaddhw_acc>;
-defm : T_WVV_HVX_gen_pat <V6_vadduhw_acc, int_hexagon_V6_vadduhw_acc>;
-defm : T_WVV_HVX_gen_pat <V6_vaddubh_acc, int_hexagon_V6_vaddubh_acc>;
-defm : T_WVV_HVX_gen_pat <V6_vmpyowh_64_acc, int_hexagon_V6_vmpyowh_64_acc>;
-defm : T_WR_HVX_gen_pat <V6_vmpauhb, int_hexagon_V6_vmpauhb>;
-defm : T_WWR_HVX_gen_pat <V6_vmpauhb_acc, int_hexagon_V6_vmpauhb_acc>;
-defm : T_VVR_HVX_gen_pat <V6_vmpyiwub_acc, int_hexagon_V6_vmpyiwub_acc>;
-defm : T_ZR_HVX_gen_pat <V6_vandnqrt, int_hexagon_V6_vandnqrt>;
-defm : T_VZR_HVX_gen_pat <V6_vandnqrt_acc, int_hexagon_V6_vandnqrt_acc>;
-defm : T_ZV_HVX_gen_pat <V6_vandvqv, int_hexagon_V6_vandvqv>;
-defm : T_ZV_HVX_gen_pat <V6_vandvnqv, int_hexagon_V6_vandvnqv>;
-defm : T_R_HVX_gen_pat <V6_pred_scalar2v2, int_hexagon_V6_pred_scalar2v2>;
-defm : T_R_HVX_gen_pat <V6_lvsplath, int_hexagon_V6_lvsplath>;
-defm : T_R_HVX_gen_pat <V6_lvsplatb, int_hexagon_V6_lvsplatb>;
-defm : T_ZZ_HVX_gen_pat <V6_shuffeqw, int_hexagon_V6_shuffeqw>;
-defm : T_ZZ_HVX_gen_pat <V6_shuffeqh, int_hexagon_V6_shuffeqh>;
-defm : T_VVI_HVX_gen_pat <V6_vlutvvbi, int_hexagon_V6_vlutvvbi>;
-defm : T_VVI_HVX_gen_pat <V6_vlutvwhi, int_hexagon_V6_vlutvwhi>;
-defm : T_VVVI_HVX_gen_pat <V6_vlutvvb_oracci, int_hexagon_V6_vlutvvb_oracci>;
-defm : T_WVVI_HVX_gen_pat <V6_vlutvwh_oracci, int_hexagon_V6_vlutvwh_oracci>;
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index a31fa57..e915a3c4 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -2514,8 +2514,9 @@ SDValue LoongArchTargetLowering::lowerBUILD_VECTOR(SDValue Op,
assert(ResTy.isVector());
unsigned NumElts = ResTy.getVectorNumElements();
- SDValue Vector = DAG.getUNDEF(ResTy);
- for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue Vector =
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ResTy, Node->getOperand(0));
+ for (unsigned i = 1; i < NumElts; ++i) {
Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ResTy, Vector,
Node->getOperand(i),
DAG.getConstant(i, DL, Subtarget.getGRLenVT()));
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index a0107e4..5096a8f 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -1651,18 +1651,20 @@ def : Pat<(vector_insert v8i32:$xd, GRLenVT:$rj, uimm3:$imm),
(XVINSGR2VR_W v8i32:$xd, GRLenVT:$rj, uimm3:$imm)>;
def : Pat<(vector_insert v4i64:$xd, GRLenVT:$rj, uimm2:$imm),
(XVINSGR2VR_D v4i64:$xd, GRLenVT:$rj, uimm2:$imm)>;
-def : Pat<(vector_insert v8f32:$vd, (loongarch_movgr2fr_w_la64 GPR:$rj), uimm3:$imm),
- (XVINSGR2VR_W $vd, $rj, uimm3:$imm)>;
-def : Pat<(vector_insert v4f64:$vd, (f64 (bitconvert i64:$rj)), uimm2:$imm),
- (XVINSGR2VR_D $vd, $rj, uimm2:$imm)>;
+def : Pat<(vector_insert v8f32:$xd, (loongarch_movgr2fr_w_la64 GPR:$rj), uimm3:$imm),
+ (XVINSGR2VR_W $xd, $rj, uimm3:$imm)>;
+def : Pat<(vector_insert v4f64:$xd, (f64 (bitconvert i64:$rj)), uimm2:$imm),
+ (XVINSGR2VR_D $xd, $rj, uimm2:$imm)>;
def : Pat<(vector_insert v8f32:$xd, (f32 (vector_extract v8f32:$xj, uimm3:$imm1)), uimm3:$imm2),
(XVINSGR2VR_W $xd, (XVPICKVE2GR_W v8f32:$xj, uimm3:$imm1), uimm3:$imm2)>;
def : Pat<(vector_insert v4f64:$xd, (f64 (vector_extract v4f64:$xj, uimm2:$imm1)), uimm2:$imm2),
(XVINSGR2VR_D $xd, (XVPICKVE2GR_D v4f64:$xj, uimm2:$imm1), uimm2:$imm2)>;
+
+// XVINSVE0_{W/D}
def : Pat<(vector_insert v8f32:$xd, FPR32:$fj, uimm3:$imm),
- (XVINSGR2VR_W $xd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm3:$imm)>;
+ (XVINSVE0_W $xd, (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32), uimm3:$imm)>;
def : Pat<(vector_insert v4f64:$xd, FPR64:$fj, uimm2:$imm),
- (XVINSGR2VR_D $xd, (COPY_TO_REGCLASS FPR64:$fj, GPR), uimm2:$imm)>;
+ (XVINSVE0_D $xd, (SUBREG_TO_REG (i64 0), FPR64:$fj, sub_64), uimm2:$imm)>;
// scalar_to_vector
def : Pat<(v8f32 (scalar_to_vector FPR32:$fj)),
diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
index 962e7c2..3c9defb 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
@@ -1842,10 +1842,19 @@ def : Pat<(vector_insert v4f32:$vd, (loongarch_movgr2fr_w_la64 GPR:$rj), uimm2:$
(VINSGR2VR_W $vd, $rj, uimm2:$imm)>;
def : Pat<(vector_insert v2f64:$vd, (f64 (bitconvert i64:$rj)), uimm1:$imm),
(VINSGR2VR_D $vd, $rj, uimm1:$imm)>;
-def : Pat<(vector_insert v4f32:$vd, FPR32:$fj, uimm2:$imm),
- (VINSGR2VR_W $vd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm2:$imm)>;
-def : Pat<(vector_insert v2f64:$vd, FPR64:$fj, uimm1:$imm),
- (VINSGR2VR_D $vd, (COPY_TO_REGCLASS FPR64:$fj, GPR), uimm1:$imm)>;
+
+// VEXTRINS_{W/D}
+foreach imm = 0...3 in {
+ defvar Imm = !shl(imm, 4);
+ def : Pat<(vector_insert v4f32:$vd, FPR32:$fj, imm),
+ (VEXTRINS_W $vd, (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32), Imm)>;
+}
+
+foreach imm = 0...1 in {
+ defvar Imm = !shl(imm, 4);
+ def : Pat<(vector_insert v2f64:$vd, FPR64:$fj, imm),
+ (VEXTRINS_D $vd, (SUBREG_TO_REG (i64 0), FPR64:$fj, sub_64), Imm)>;
+}
// scalar_to_vector
def : Pat<(v4f32 (scalar_to_vector FPR32:$fj)),
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.cpp
index 03ce004..7cefb3f 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.cpp
@@ -52,6 +52,9 @@ static ABI getTripleABI(const Triple &TT) {
bool Is64Bit = TT.isArch64Bit();
ABI TripleABI;
switch (TT.getEnvironment()) {
+ case llvm::Triple::EnvironmentType::UnknownEnvironment:
+ TripleABI = ABI_Unknown;
+ break;
case llvm::Triple::EnvironmentType::GNUSF:
case llvm::Triple::EnvironmentType::MuslSF:
TripleABI = Is64Bit ? ABI_LP64S : ABI_ILP32S;
@@ -96,7 +99,7 @@ ABI computeTargetABI(const Triple &TT, const FeatureBitset &FeatureBits,
// 1. If the '-target-abi' is valid, use it.
if (IsABIValidForFeature(ArgProvidedABI)) {
- if (TT.hasEnvironment() && ArgProvidedABI != TripleABI)
+ if (IsABIValidForFeature(TripleABI) && ArgProvidedABI != TripleABI)
errs()
<< "warning: triple-implied ABI conflicts with provided target-abi '"
<< ABIName << "', using target-abi\n";
@@ -164,10 +167,7 @@ ABI computeTargetABI(const Triple &TT, const FeatureBitset &FeatureBits,
return Is64Bit ? ABI_LP64F : ABI_ILP32F;
return Is64Bit ? ABI_LP64S : ABI_ILP32S;
};
- if (ABIName.empty())
- errs() << "warning: the triple-implied ABI is invalid, ignoring and using "
- "feature-implied ABI\n";
- else
+ if (!ABIName.empty())
errs() << "warning: both target-abi and the triple-implied ABI are "
"invalid, ignoring and using feature-implied ABI\n";
return checkABIStandardized(GetFeatureABI());
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
index ad8f5f0..7abe9c9 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
@@ -385,11 +385,12 @@ void MipsELFObjectWriter::sortRelocs(std::vector<ELFRelocationEntry> &Relocs) {
if (hasRelocationAddend())
return;
- // Sort relocations by the address they are applied to.
- llvm::sort(Relocs,
- [](const ELFRelocationEntry &A, const ELFRelocationEntry &B) {
- return A.Offset < B.Offset;
- });
+  // Sort relocations by r_offset. There may be more than one relocation at
+  // the same offset when composed relocations or .reloc directives are used.
+ llvm::stable_sort(
+ Relocs, [](const ELFRelocationEntry &A, const ELFRelocationEntry &B) {
+ return A.Offset < B.Offset;
+ });
// Place relocations in a list for reorder convenience. Hi16 contains the
// iterators of high-part relocations.
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 7883acc..f2c2f46 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -2068,6 +2068,8 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
static SDValue getPRMT(SDValue A, SDValue B, SDValue Selector, SDLoc DL,
SelectionDAG &DAG,
unsigned Mode = NVPTX::PTXPrmtMode::NONE) {
+ assert(A.getValueType() == MVT::i32 && B.getValueType() == MVT::i32 &&
+ Selector.getValueType() == MVT::i32 && "PRMT must have i32 operands");
return DAG.getNode(NVPTXISD::PRMT, DL, MVT::i32,
{A, B, Selector, DAG.getConstant(Mode, DL, MVT::i32)});
}
@@ -5872,6 +5874,8 @@ static SDValue combineADDRSPACECAST(SDNode *N,
// details:
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt
static APInt getPRMTSelector(const APInt &Selector, unsigned Mode) {
+ assert(Selector.getBitWidth() == 32 && "PRMT must have i32 operands");
+
if (Mode == NVPTX::PTXPrmtMode::NONE)
return Selector;
@@ -5903,6 +5907,8 @@ static APInt getPRMTSelector(const APInt &Selector, unsigned Mode) {
}
static APInt computePRMT(APInt A, APInt B, APInt Selector, unsigned Mode) {
+ assert(A.getBitWidth() == 32 && B.getBitWidth() == 32 &&
+ Selector.getBitWidth() == 32 && "PRMT must have i32 operands");
// {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}
APInt BitField = B.concat(A);
APInt SelectorVal = getPRMTSelector(Selector, Mode);
@@ -6537,10 +6543,13 @@ static void computeKnownBitsForPRMT(const SDValue Op, KnownBits &Known,
KnownBits BKnown = DAG.computeKnownBits(B, Depth);
// {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}
+ assert(AKnown.getBitWidth() == 32 && BKnown.getBitWidth() == 32 &&
+ "PRMT must have i32 operands");
+ assert(Known.getBitWidth() == 32 && "PRMT must have i32 result");
KnownBits BitField = BKnown.concat(AKnown);
APInt SelectorVal = getPRMTSelector(Selector->getAPIntValue(), Mode);
- for (unsigned I : llvm::seq(std::min(4U, Known.getBitWidth() / 8))) {
+ for (unsigned I : llvm::seq(4)) {
APInt Sel = SelectorVal.extractBits(4, I * 4);
unsigned Idx = Sel.getLoBits(3).getZExtValue();
unsigned Sign = Sel.getHiBits(1).getZExtValue();
@@ -6564,3 +6573,102 @@ void NVPTXTargetLowering::computeKnownBitsForTargetNode(
break;
}
}
+
+static std::pair<APInt, APInt> getPRMTDemandedBits(const APInt &SelectorVal,
+ const APInt &DemandedBits) {
+ APInt DemandedLHS = APInt(32, 0);
+ APInt DemandedRHS = APInt(32, 0);
+
+ for (unsigned I : llvm::seq(4)) {
+ if (DemandedBits.extractBits(8, I * 8).isZero())
+ continue;
+
+ APInt Sel = SelectorVal.extractBits(4, I * 4);
+ unsigned Idx = Sel.getLoBits(3).getZExtValue();
+ unsigned Sign = Sel.getHiBits(1).getZExtValue();
+
+ APInt &Src = Idx < 4 ? DemandedLHS : DemandedRHS;
+ unsigned ByteStart = (Idx % 4) * 8;
+ if (Sign)
+ Src.setBit(ByteStart + 7);
+ else
+ Src.setBits(ByteStart, ByteStart + 8);
+ }
+
+ return {DemandedLHS, DemandedRHS};
+}
+
+// Replace undef with 0 as this is easier for other optimizations such as
+// known bits.
+static SDValue canonicalizePRMTInput(SDValue Op, SelectionDAG &DAG) {
+ if (!Op)
+ return SDValue();
+ if (Op.isUndef())
+ return DAG.getConstant(0, SDLoc(), MVT::i32);
+ return Op;
+}
+
+static SDValue simplifyDemandedBitsForPRMT(SDValue PRMT,
+ const APInt &DemandedBits,
+ SelectionDAG &DAG,
+ const TargetLowering &TLI,
+ unsigned Depth) {
+ assert(PRMT.getOpcode() == NVPTXISD::PRMT);
+ SDValue Op0 = PRMT.getOperand(0);
+ SDValue Op1 = PRMT.getOperand(1);
+ auto *SelectorConst = dyn_cast<ConstantSDNode>(PRMT.getOperand(2));
+ if (!SelectorConst)
+ return SDValue();
+
+ unsigned Mode = PRMT.getConstantOperandVal(3);
+ const APInt Selector = getPRMTSelector(SelectorConst->getAPIntValue(), Mode);
+
+ // Try to simplify the PRMT to one of the inputs if the used bytes are all
+ // from the same input in the correct order.
+ const unsigned LeadingBytes = DemandedBits.countLeadingZeros() / 8;
+ const unsigned SelBits = (4 - LeadingBytes) * 4;
+ if (Selector.getLoBits(SelBits) == APInt(32, 0x3210).getLoBits(SelBits))
+ return Op0;
+ if (Selector.getLoBits(SelBits) == APInt(32, 0x7654).getLoBits(SelBits))
+ return Op1;
+
+ auto [DemandedLHS, DemandedRHS] = getPRMTDemandedBits(Selector, DemandedBits);
+
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ SDValue DemandedOp0 =
+ TLI.SimplifyMultipleUseDemandedBits(Op0, DemandedLHS, DAG, Depth + 1);
+ SDValue DemandedOp1 =
+ TLI.SimplifyMultipleUseDemandedBits(Op1, DemandedRHS, DAG, Depth + 1);
+
+ DemandedOp0 = canonicalizePRMTInput(DemandedOp0, DAG);
+ DemandedOp1 = canonicalizePRMTInput(DemandedOp1, DAG);
+ if ((DemandedOp0 && DemandedOp0 != Op0) ||
+ (DemandedOp1 && DemandedOp1 != Op1)) {
+ Op0 = DemandedOp0 ? DemandedOp0 : Op0;
+ Op1 = DemandedOp1 ? DemandedOp1 : Op1;
+ return getPRMT(Op0, Op1, Selector.getZExtValue(), SDLoc(PRMT), DAG);
+ }
+
+ return SDValue();
+}
+
+bool NVPTXTargetLowering::SimplifyDemandedBitsForTargetNode(
+ SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
+ KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const {
+ Known.resetAll();
+
+ switch (Op.getOpcode()) {
+ case NVPTXISD::PRMT:
+ if (SDValue Result = simplifyDemandedBitsForPRMT(Op, DemandedBits, TLO.DAG,
+ *this, Depth)) {
+ TLO.CombineTo(Op, Result);
+ return true;
+ }
+ break;
+ default:
+ break;
+ }
+
+ computeKnownBitsForTargetNode(Op, Known, DemandedElts, TLO.DAG, Depth);
+ return false;
+}
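
The demanded-bits rule that getPRMTDemandedBits implements can be restated with plain integers as below (a standalone sketch, not part of the patch; the function name is made up). Each 4-bit selector nibble I picks byte (Sel & 7) from the 8-byte pool {B, A}, and when the nibble's top bit is set only that byte's sign bit is replicated, so only bit 7 of the source byte is demanded.

#include <cstdint>
#include <utility>

static std::pair<uint32_t, uint32_t>
prmtDemandedBits(uint32_t Selector, uint32_t DemandedBits) {
  uint32_t DemandedA = 0, DemandedB = 0; // A = Op0 (low bytes), B = Op1.
  for (unsigned I = 0; I < 4; ++I) {
    if (((DemandedBits >> (I * 8)) & 0xff) == 0)
      continue; // Result byte I is never read.
    uint32_t Sel = (Selector >> (I * 4)) & 0xf;
    unsigned Idx = Sel & 0x7;     // Which of the 8 source bytes.
    bool Sign = (Sel & 0x8) != 0; // Sign-replication mode.
    uint32_t &Src = Idx < 4 ? DemandedA : DemandedB;
    unsigned ByteStart = (Idx % 4) * 8;
    Src |= Sign ? (0x80u << ByteStart) : (0xffu << ByteStart);
  }
  return {DemandedA, DemandedB};
}

With selector 0x3210 and all result bits demanded, every byte of A and none of B is demanded, which is why simplifyDemandedBitsForPRMT can fold such a PRMT straight to Op0.
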
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index bc3548c..228e2aa 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -275,6 +275,11 @@ public:
const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth = 0) const override;
+ bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits,
+ const APInt &DemandedElts,
+ KnownBits &Known,
+ TargetLoweringOpt &TLO,
+ unsigned Depth = 0) const override;
private:
const NVPTXSubtarget &STI; // cache the subtarget here
diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td
index 1ac91fa..80fac18 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td
@@ -53,34 +53,30 @@ let Predicates = [IsISAFuture] in {
let Predicates = [HasVSX, IsISAFuture] in {
let mayLoad = 1 in {
- def LXVRL : XX1Form_memOp<31, 525, (outs vsrc:$XT), (ins memr:$RA, g8rc:$RB),
- "lxvrl $XT, $RA, $RB", IIC_LdStLoad, []>;
-
- def LXVRLL : XX1Form_memOp<31, 557, (outs vsrc:$XT), (ins memr:$RA, g8rc:$RB),
- "lxvrll $XT, $RA, $RB", IIC_LdStLoad, []>;
-
- def LXVPRL : XForm_XTp5_XAB5<31, 589, (outs vsrprc:$XTp),
- (ins memr:$RA, g8rc:$RB),
- "lxvprl $XTp, $RA, $RB", IIC_LdStLFD, []>;
-
- def LXVPRLL : XForm_XTp5_XAB5<31, 621, (outs vsrprc:$XTp),
- (ins memr:$RA, g8rc:$RB),
- "lxvprll $XTp, $RA, $RB", IIC_LdStLFD, []>;
+ def LXVRL
+ : XX1Form_memOp<31, 525, (outs vsrc:$XT), (ins memr:$RA, g8rc:$RB),
+ "lxvrl $XT, $RA, $RB", IIC_LdStLoad, []>;
+ def LXVRLL
+ : XX1Form_memOp<31, 557, (outs vsrc:$XT), (ins memr:$RA, g8rc:$RB),
+ "lxvrll $XT, $RA, $RB", IIC_LdStLoad, []>;
+ def LXVPRL
+ : XForm_XTp5_XAB5<31, 589, (outs vsrprc:$XTp), (ins memr:$RA, g8rc:$RB),
+ "lxvprl $XTp, $RA, $RB", IIC_LdStLFD, []>;
+ def LXVPRLL
+ : XForm_XTp5_XAB5<31, 621, (outs vsrprc:$XTp), (ins memr:$RA, g8rc:$RB),
+ "lxvprll $XTp, $RA, $RB", IIC_LdStLFD, []>;
}
let mayStore = 1 in {
- def STXVRL : XX1Form_memOp<31, 653, (outs),
- (ins vsrc:$XT, memr:$RA, g8rc:$RB),
- "stxvrl $XT, $RA, $RB", IIC_LdStLoad, []>;
-
- def STXVRLL : XX1Form_memOp<31, 685, (outs),
- (ins vsrc:$XT, memr:$RA, g8rc:$RB),
- "stxvrll $XT, $RA, $RB", IIC_LdStLoad, []>;
-
+ def STXVRL
+ : XX1Form_memOp<31, 653, (outs), (ins vsrc:$XT, memr:$RA, g8rc:$RB),
+ "stxvrl $XT, $RA, $RB", IIC_LdStLoad, []>;
+ def STXVRLL
+ : XX1Form_memOp<31, 685, (outs), (ins vsrc:$XT, memr:$RA, g8rc:$RB),
+ "stxvrll $XT, $RA, $RB", IIC_LdStLoad, []>;
def STXVPRL : XForm_XTp5_XAB5<31, 717, (outs),
(ins vsrprc:$XTp, memr:$RA, g8rc:$RB),
"stxvprl $XTp, $RA, $RB", IIC_LdStLFD, []>;
-
def STXVPRLL : XForm_XTp5_XAB5<31, 749, (outs),
(ins vsrprc:$XTp, memr:$RA, g8rc:$RB),
"stxvprll $XTp, $RA, $RB", IIC_LdStLFD, []>;
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
index 75a0272..996b6ef 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -171,7 +171,7 @@ void PPCSubtarget::getCriticalPathRCs(RegClassVector &CriticalPathRCs) const {
}
void PPCSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
- unsigned NumRegionInstrs) const {
+ const SchedRegion &Region) const {
// The GenericScheduler that we use defaults to scheduling bottom up only.
// We want to schedule from both the top and the bottom and so we set
// OnlyBottomUp to false.
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h
index 9a97d1a..3c59a47 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.h
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h
@@ -240,7 +240,8 @@ public:
void getCriticalPathRCs(RegClassVector &CriticalPathRCs) const override;
void overrideSchedPolicy(MachineSchedPolicy &Policy,
- unsigned NumRegionInstrs) const override;
+ const SchedRegion &Region) const override;
+
bool useAA() const override;
bool enableSubRegLiveness() const override;
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp
index aeda5ac..5abb546 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp
@@ -52,15 +52,6 @@ namespace RISCV {
#include "RISCVGenSearchableTables.inc"
} // namespace RISCV
-// Report an error but don't ask the user to report a bug.
-// TODO: Remove these wrappers.
-[[noreturn]] static void reportError(const char *Reason) {
- reportFatalUsageError(Reason);
-}
-[[noreturn]] static void reportError(Error Err) {
- reportFatalUsageError(std::move(Err));
-}
-
namespace RISCVABI {
ABI computeTargetABI(const Triple &TT, const FeatureBitset &FeatureBits,
StringRef ABIName) {
@@ -97,7 +88,7 @@ ABI computeTargetABI(const Triple &TT, const FeatureBitset &FeatureBits,
if ((TargetABI == RISCVABI::ABI::ABI_ILP32E ||
(TargetABI == ABI_Unknown && IsRVE && !IsRV64)) &&
FeatureBits[RISCV::FeatureStdExtD])
- reportError("ILP32E cannot be used with the D ISA extension");
+ reportFatalUsageError("ILP32E cannot be used with the D ISA extension");
if (TargetABI != ABI_Unknown)
return TargetABI;
@@ -105,7 +96,7 @@ ABI computeTargetABI(const Triple &TT, const FeatureBitset &FeatureBits,
// If no explicit ABI is given, try to compute the default ABI.
auto ISAInfo = RISCVFeatures::parseFeatureBits(IsRV64, FeatureBits);
if (!ISAInfo)
- reportError(ISAInfo.takeError());
+ reportFatalUsageError(ISAInfo.takeError());
return getTargetABI((*ISAInfo)->computeDefaultABI());
}
@@ -137,12 +128,12 @@ namespace RISCVFeatures {
void validate(const Triple &TT, const FeatureBitset &FeatureBits) {
if (TT.isArch64Bit() && !FeatureBits[RISCV::Feature64Bit])
- reportError("RV64 target requires an RV64 CPU");
+ reportFatalUsageError("RV64 target requires an RV64 CPU");
if (!TT.isArch64Bit() && !FeatureBits[RISCV::Feature32Bit])
- reportError("RV32 target requires an RV32 CPU");
+ reportFatalUsageError("RV32 target requires an RV32 CPU");
if (FeatureBits[RISCV::Feature32Bit] &&
FeatureBits[RISCV::Feature64Bit])
- reportError("RV32 and RV64 can't be combined");
+ reportFatalUsageError("RV32 and RV64 can't be combined");
}
llvm::Expected<std::unique_ptr<RISCVISAInfo>>
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
index baa508a..269b117 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
@@ -13,13 +13,7 @@
#include "MCTargetDesc/RISCVAsmBackend.h"
#include "MCTargetDesc/RISCVMCAsmInfo.h"
-#include "RISCVFixupKinds.h"
#include "llvm/BinaryFormat/ELF.h"
-#include "llvm/MC/MCAssembler.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCValue.h"
-#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.td b/llvm/lib/Target/RISCV/RISCVCallingConv.td
index cbf039e..4c303a9 100644
--- a/llvm/lib/Target/RISCV/RISCVCallingConv.td
+++ b/llvm/lib/Target/RISCV/RISCVCallingConv.td
@@ -56,19 +56,21 @@ def CSR_XLEN_F32_Interrupt: CalleeSavedRegs<(add CSR_Interrupt,
def CSR_XLEN_F64_Interrupt: CalleeSavedRegs<(add CSR_Interrupt,
(sequence "F%u_D", 0, 31))>;
+defvar VREGS = (add (sequence "V%u", 0, 31),
+ (sequence "V%uM2", 0, 31, 2),
+ (sequence "V%uM4", 0, 31, 4),
+ (sequence "V%uM8", 0, 31, 8));
+
// Same as CSR_Interrupt, but including all vector registers.
-def CSR_XLEN_V_Interrupt: CalleeSavedRegs<(add CSR_Interrupt,
- (sequence "V%u", 0, 31))>;
+def CSR_XLEN_V_Interrupt: CalleeSavedRegs<(add CSR_Interrupt, VREGS)>;
// Same as CSR_Interrupt, but including all 32-bit FP registers and all vector
// registers.
-def CSR_XLEN_F32_V_Interrupt: CalleeSavedRegs<(add CSR_XLEN_F32_Interrupt,
- (sequence "V%u", 0, 31))>;
+def CSR_XLEN_F32_V_Interrupt: CalleeSavedRegs<(add CSR_XLEN_F32_Interrupt, VREGS)>;
// Same as CSR_Interrupt, but including all 64-bit FP registers and all vector
// registers.
-def CSR_XLEN_F64_V_Interrupt: CalleeSavedRegs<(add CSR_XLEN_F64_Interrupt,
- (sequence "V%u", 0, 31))>;
+def CSR_XLEN_F64_V_Interrupt: CalleeSavedRegs<(add CSR_XLEN_F64_Interrupt, VREGS)>;
// Same as CSR_Interrupt, but excluding X16-X31.
def CSR_Interrupt_RVE : CalleeSavedRegs<(sub CSR_Interrupt,
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index f9c0b54..171940e 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1272,7 +1272,7 @@ def FeatureVendorXSfmm128t
def FeatureVendorXSfvqmaccdod
: RISCVExtension<1, 0,
"SiFive Int8 Matrix Multiplication Instructions (2-by-8 and 8-by-2)",
- [FeatureStdExtZve32x]>;
+ [FeatureStdExtZve32x, FeatureStdExtZvl128b]>;
def HasVendorXSfvqmaccdod
: Predicate<"Subtarget->hasVendorXSfvqmaccdod()">,
AssemblerPredicate<(all_of FeatureVendorXSfvqmaccdod),
@@ -1281,7 +1281,7 @@ def HasVendorXSfvqmaccdod
def FeatureVendorXSfvqmaccqoq
: RISCVExtension<1, 0,
"SiFive Int8 Matrix Multiplication Instructions (4-by-8 and 8-by-4)",
- [FeatureStdExtZve32x]>;
+ [FeatureStdExtZve32x, FeatureStdExtZvl256b]>;
def HasVendorXSfvqmaccqoq
: Predicate<"Subtarget->hasVendorXSfvqmaccqoq()">,
AssemblerPredicate<(all_of FeatureVendorXSfvqmaccqoq),
@@ -1290,7 +1290,7 @@ def HasVendorXSfvqmaccqoq
def FeatureVendorXSfvfwmaccqqq
: RISCVExtension<1, 0,
"SiFive Matrix Multiply Accumulate Instruction (4-by-4)",
- [FeatureStdExtZvfbfmin]>;
+ [FeatureStdExtZvfbfmin, FeatureStdExtZvl128b]>;
def HasVendorXSfvfwmaccqqq
: Predicate<"Subtarget->hasVendorXSfvfwmaccqqq()">,
AssemblerPredicate<(all_of FeatureVendorXSfvfwmaccqqq),
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index 23b4554..b1ab76a 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -1544,10 +1544,53 @@ RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
return Offset;
}
+static MCRegister getRVVBaseRegister(const RISCVRegisterInfo &TRI,
+ const Register &Reg) {
+ MCRegister BaseReg = TRI.getSubReg(Reg, RISCV::sub_vrm1_0);
+  // If it's not a grouped vector register, it has no subregisters, so the
+  // base register is just itself.
+ if (BaseReg == RISCV::NoRegister)
+ BaseReg = Reg;
+ return BaseReg;
+}
+
void RISCVFrameLowering::determineCalleeSaves(MachineFunction &MF,
BitVector &SavedRegs,
RegScavenger *RS) const {
TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+
+  // TargetFrameLowering::determineCalleeSaves marks a vector register as
+  // saved if any of its subregisters is clobbered, which is not correct for
+  // grouped vector registers. We only want a vector register group to be
+  // marked as saved if all of its subregisters are clobbered.
+ // For example:
+ // Original behavior: If v24 is marked, v24m2, v24m4, v24m8 are also marked.
+ // Correct behavior: v24m2 is marked only if v24 and v25 are marked.
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
+ const RISCVRegisterInfo &TRI = *STI.getRegisterInfo();
+ for (unsigned i = 0; CSRegs[i]; ++i) {
+ unsigned CSReg = CSRegs[i];
+ // Only vector registers need special care.
+ if (!RISCV::VRRegClass.contains(getRVVBaseRegister(TRI, CSReg)))
+ continue;
+
+ SavedRegs.reset(CSReg);
+
+ auto SubRegs = TRI.subregs(CSReg);
+    // If this register is clobbered, mark it and all of its subregisters as
+    // saved.
+ if (!MRI.def_empty(CSReg) || MRI.getUsedPhysRegsMask().test(CSReg)) {
+ SavedRegs.set(CSReg);
+      llvm::for_each(SubRegs, [&](unsigned Reg) { SavedRegs.set(Reg); });
+ }
+
+ // Combine to super register if all of its subregisters are marked.
+ if (!SubRegs.empty() && llvm::all_of(SubRegs, [&](unsigned Reg) {
+ return SavedRegs.test(Reg);
+ }))
+ SavedRegs.set(CSReg);
+ }
+
// Unconditionally spill RA and FP only if the function uses a frame
// pointer.
if (hasFP(MF)) {
@@ -2137,16 +2180,6 @@ static unsigned getCalleeSavedRVVNumRegs(const Register &BaseReg) {
: 8;
}
-static MCRegister getRVVBaseRegister(const RISCVRegisterInfo &TRI,
- const Register &Reg) {
- MCRegister BaseReg = TRI.getSubReg(Reg, RISCV::sub_vrm1_0);
- // If it's not a grouped vector register, it doesn't have subregister, so
- // the base register is just itself.
- if (BaseReg == RISCV::NoRegister)
- BaseReg = Reg;
- return BaseReg;
-}
-
void RISCVFrameLowering::emitCalleeSavedRVVPrologCFI(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, bool HasFP) const {
MachineFunction *MF = MBB.getParent();
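
The new determineCalleeSaves logic boils down to the rule sketched below (a standalone illustration, not part of the patch; the types and names are made up): a grouped vector register such as v24m2 = {v24, v25} is only treated as saved when every LMUL=1 member is clobbered, rather than whenever any member is.

#include <set>
#include <string>
#include <vector>

struct RegGroup {
  std::string Name;                 // e.g. "v24m2"
  std::vector<std::string> Members; // e.g. {"v24", "v25"}
};

static bool mustSaveGroup(const RegGroup &G,
                          const std::set<std::string> &Clobbered) {
  for (const std::string &Sub : G.Members)
    if (!Clobbered.count(Sub))
      return false; // One untouched member is enough to skip the group.
  return true;
}
// Example: if only v24 is clobbered, v24m2 is *not* saved as a group;
// if both v24 and v25 are clobbered, it is.
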
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index a541c2f..34910b7 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -3032,6 +3032,63 @@ bool RISCVDAGToDAGISel::SelectAddrRegImmLsb00000(SDValue Addr, SDValue &Base,
return true;
}
+/// Return true if this is a load/store for which we have a RegRegScale
+/// instruction.
+static bool isRegRegScaleLoadOrStore(SDNode *User, SDValue Add,
+ const RISCVSubtarget &Subtarget) {
+ if (User->getOpcode() != ISD::LOAD && User->getOpcode() != ISD::STORE)
+ return false;
+ EVT VT = cast<MemSDNode>(User)->getMemoryVT();
+ if (!(VT.isScalarInteger() &&
+ (Subtarget.hasVendorXTHeadMemIdx() || Subtarget.hasVendorXqcisls())) &&
+ !((VT == MVT::f32 || VT == MVT::f64) &&
+ Subtarget.hasVendorXTHeadFMemIdx()))
+ return false;
+ // Don't allow stores of the value. It must be used as the address.
+ if (User->getOpcode() == ISD::STORE &&
+ cast<StoreSDNode>(User)->getValue() == Add)
+ return false;
+
+ return true;
+}
+
+/// Return true if it is profitable to fold this Add into a RegRegScale
+/// load/store. If \p Shift is non-null, then we have matched a shl+add. We
+/// allow reassociating
+/// (add (add (shl A C2) B) C1) -> (add (add B C1) (shl A C2)) if there is a
+/// single addi and we don't have a SHXADD instruction we could use.
+/// FIXME: May still need to check how many and what kind of users the SHL has.
+static bool isWorthFoldingIntoRegRegScale(const RISCVSubtarget &Subtarget,
+ SDValue Add,
+ SDValue Shift = SDValue()) {
+ bool FoundADDI = false;
+ for (auto *User : Add->users()) {
+ if (isRegRegScaleLoadOrStore(User, Add, Subtarget))
+ continue;
+
+ // Allow a single ADDI that is used by loads/stores if we matched a shift.
+ if (!Shift || FoundADDI || User->getOpcode() != ISD::ADD ||
+ !isa<ConstantSDNode>(User->getOperand(1)) ||
+ !isInt<12>(cast<ConstantSDNode>(User->getOperand(1))->getSExtValue()))
+ return false;
+
+ FoundADDI = true;
+
+ // If we have a SHXADD instruction, prefer that over reassociating an ADDI.
+ assert(Shift.getOpcode() == ISD::SHL);
+ unsigned ShiftAmt = Shift.getConstantOperandVal(1);
+ if ((ShiftAmt <= 3 &&
+ (Subtarget.hasStdExtZba() || Subtarget.hasVendorXTHeadBa())) ||
+ (ShiftAmt >= 4 && ShiftAmt <= 7 && Subtarget.hasVendorXqciac()))
+ return false;
+
+ // All users of the ADDI should be load/store.
+ for (auto *ADDIUser : User->users())
+ if (!isRegRegScaleLoadOrStore(ADDIUser, SDValue(User, 0), Subtarget))
+ return false;
+ }
+
+ return true;
+}
+
bool RISCVDAGToDAGISel::SelectAddrRegRegScale(SDValue Addr,
unsigned MaxShiftAmount,
SDValue &Base, SDValue &Index,
@@ -3062,7 +3119,8 @@ bool RISCVDAGToDAGISel::SelectAddrRegRegScale(SDValue Addr,
if (LHS.getOpcode() == ISD::ADD &&
!isa<ConstantSDNode>(LHS.getOperand(1)) &&
isInt<12>(C1->getSExtValue())) {
- if (SelectShl(LHS.getOperand(1), Index, Scale)) {
+ if (SelectShl(LHS.getOperand(1), Index, Scale) &&
+ isWorthFoldingIntoRegRegScale(*Subtarget, LHS, LHS.getOperand(1))) {
SDValue C1Val = CurDAG->getTargetConstant(*C1->getConstantIntValue(),
SDLoc(Addr), VT);
Base = SDValue(CurDAG->getMachineNode(RISCV::ADDI, SDLoc(Addr), VT,
@@ -3072,7 +3130,8 @@ bool RISCVDAGToDAGISel::SelectAddrRegRegScale(SDValue Addr,
}
// Add is commutative so we need to check both operands.
- if (SelectShl(LHS.getOperand(0), Index, Scale)) {
+ if (SelectShl(LHS.getOperand(0), Index, Scale) &&
+ isWorthFoldingIntoRegRegScale(*Subtarget, LHS, LHS.getOperand(0))) {
SDValue C1Val = CurDAG->getTargetConstant(*C1->getConstantIntValue(),
SDLoc(Addr), VT);
Base = SDValue(CurDAG->getMachineNode(RISCV::ADDI, SDLoc(Addr), VT,
@@ -3090,16 +3149,23 @@ bool RISCVDAGToDAGISel::SelectAddrRegRegScale(SDValue Addr,
// Try to match a shift on the RHS.
if (SelectShl(RHS, Index, Scale)) {
+ if (!isWorthFoldingIntoRegRegScale(*Subtarget, Addr, RHS))
+ return false;
Base = LHS;
return true;
}
// Try to match a shift on the LHS.
if (SelectShl(LHS, Index, Scale)) {
+ if (!isWorthFoldingIntoRegRegScale(*Subtarget, Addr, LHS))
+ return false;
Base = RHS;
return true;
}
+ if (!isWorthFoldingIntoRegRegScale(*Subtarget, Addr))
+ return false;
+
Base = LHS;
Index = RHS;
Scale = CurDAG->getTargetConstant(0, SDLoc(Addr), VT);
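
Stripped of the SelectionDAG plumbing, the heuristic added as isWorthFoldingIntoRegRegScale looks roughly like the toy model below (illustrative only, not part of the patch; it also omits the recursive check that every user of the tolerated ADDI is itself a foldable load/store): fold the ADD only if each of its users is a load/store that can consume a reg+reg<<scale address, allowing at most one small-immediate ADDI user when a shift was matched and no SHXADD-style alternative exists.

#include <vector>

enum class Kind { Load, Store, AddImm, Other };

struct ToyUser {
  Kind K;
  bool UsesValueAsAddress = true; // For stores: the address, not the data.
};

static bool worthFolding(const std::vector<ToyUser> &Users, bool MatchedShift,
                         bool HasShxadd) {
  bool FoundAddImm = false;
  for (const ToyUser &U : Users) {
    if ((U.K == Kind::Load || U.K == Kind::Store) && U.UsesValueAsAddress)
      continue; // This user folds into the addressing mode.
    if (!MatchedShift || FoundAddImm || U.K != Kind::AddImm || HasShxadd)
      return false;
    FoundAddImm = true; // Allow exactly one reassociated ADDI.
  }
  return true;
}
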
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 4845a9c..3918dd2 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2319,6 +2319,10 @@ bool RISCVTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
if (getLegalZfaFPImm(Imm, VT) >= 0)
return true;
+ // Some constants can be produced by fli+fneg.
+ if (Imm.isNegative() && getLegalZfaFPImm(-Imm, VT) >= 0)
+ return true;
+
// Cannot create a 64 bit floating-point immediate value for rv32.
if (Subtarget.getXLen() < VT.getScalarSizeInBits()) {
// td can handle +0.0 or -0.0 already.
@@ -7936,7 +7940,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
BasePtr, MachinePointerInfo(Load->getAddressSpace()), Align(8));
OutChains.push_back(LoadVal.getValue(1));
Ret = DAG.getNode(RISCVISD::TUPLE_INSERT, DL, VT, Ret, LoadVal,
- DAG.getVectorIdxConstant(i, DL));
+ DAG.getTargetConstant(i, DL, MVT::i32));
BasePtr = DAG.getNode(ISD::ADD, DL, XLenVT, BasePtr, VROffset, Flag);
}
return DAG.getMergeValues(
@@ -8015,9 +8019,10 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
// Extract subregisters in a vector tuple and store them individually.
for (unsigned i = 0; i < NF; ++i) {
- auto Extract = DAG.getNode(RISCVISD::TUPLE_EXTRACT, DL,
- MVT::getScalableVectorVT(MVT::i8, NumElts),
- StoredVal, DAG.getVectorIdxConstant(i, DL));
+ auto Extract =
+ DAG.getNode(RISCVISD::TUPLE_EXTRACT, DL,
+ MVT::getScalableVectorVT(MVT::i8, NumElts), StoredVal,
+ DAG.getTargetConstant(i, DL, MVT::i32));
Ret = DAG.getStore(Chain, DL, Extract, BasePtr,
MachinePointerInfo(Store->getAddressSpace()),
Store->getBaseAlign(),
@@ -10934,9 +10939,9 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Load->getMemoryVT(), Load->getMemOperand());
SmallVector<SDValue, 9> Results;
for (unsigned int RetIdx = 0; RetIdx < NF; RetIdx++) {
- SDValue SubVec =
- DAG.getNode(RISCVISD::TUPLE_EXTRACT, DL, ContainerVT,
- Result.getValue(0), DAG.getVectorIdxConstant(RetIdx, DL));
+ SDValue SubVec = DAG.getNode(RISCVISD::TUPLE_EXTRACT, DL, ContainerVT,
+ Result.getValue(0),
+ DAG.getTargetConstant(RetIdx, DL, MVT::i32));
Results.push_back(convertFromScalableVector(VT, SubVec, DAG, Subtarget));
}
Results.push_back(Result.getValue(1));
@@ -11023,7 +11028,7 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
RISCVISD::TUPLE_INSERT, DL, VecTupTy, StoredVal,
convertToScalableVector(
ContainerVT, FixedIntrinsic->getOperand(2 + i), DAG, Subtarget),
- DAG.getVectorIdxConstant(i, DL));
+ DAG.getTargetConstant(i, DL, MVT::i32));
SDValue Ops[] = {
FixedIntrinsic->getChain(),
@@ -12027,7 +12032,7 @@ SDValue RISCVTargetLowering::lowerVECTOR_DEINTERLEAVE(SDValue Op,
for (unsigned i = 0U; i < Factor; ++i)
Res[i] = DAG.getNode(RISCVISD::TUPLE_EXTRACT, DL, VecVT, Load,
- DAG.getVectorIdxConstant(i, DL));
+ DAG.getTargetConstant(i, DL, MVT::i32));
return DAG.getMergeValues(Res, DL);
}
@@ -12124,8 +12129,9 @@ SDValue RISCVTargetLowering::lowerVECTOR_INTERLEAVE(SDValue Op,
SDValue StoredVal = DAG.getUNDEF(VecTupTy);
for (unsigned i = 0; i < Factor; i++)
- StoredVal = DAG.getNode(RISCVISD::TUPLE_INSERT, DL, VecTupTy, StoredVal,
- Op.getOperand(i), DAG.getConstant(i, DL, XLenVT));
+ StoredVal =
+ DAG.getNode(RISCVISD::TUPLE_INSERT, DL, VecTupTy, StoredVal,
+ Op.getOperand(i), DAG.getTargetConstant(i, DL, MVT::i32));
SDValue Ops[] = {DAG.getEntryNode(),
DAG.getTargetConstant(IntrIds[Factor - 2], DL, XLenVT),
@@ -20690,7 +20696,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
SDValue Result = DAG.getUNDEF(VT);
for (unsigned i = 0; i < NF; ++i)
Result = DAG.getNode(RISCVISD::TUPLE_INSERT, DL, VT, Result, Splat,
- DAG.getVectorIdxConstant(i, DL));
+ DAG.getTargetConstant(i, DL, MVT::i32));
return Result;
}
// If this is a bitcast between a MVT::v4i1/v2i1/v1i1 and an illegal integer
@@ -24014,7 +24020,7 @@ bool RISCVTargetLowering::splitValueIntoRegisterParts(
#endif
Val = DAG.getNode(RISCVISD::TUPLE_INSERT, DL, PartVT, DAG.getUNDEF(PartVT),
- Val, DAG.getVectorIdxConstant(0, DL));
+ Val, DAG.getTargetConstant(0, DL, MVT::i32));
Parts[0] = Val;
return true;
}
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index e0a8c07..f0447e0 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -434,7 +434,8 @@ public:
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
- bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+ bool lowerInterleavedStore(Instruction *Store, Value *Mask,
+ ShuffleVectorInst *SVI,
unsigned Factor) const override;
bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
@@ -444,9 +445,6 @@ public:
Instruction *Store, Value *Mask,
ArrayRef<Value *> InterleaveValues) const override;
- bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask,
- ArrayRef<Value *> InterleaveOps) const override;
-
bool supportKCFIBundles() const override { return true; }
SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr,
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
index aef410f..dd365cf 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
@@ -44,67 +44,86 @@ def simm10_unsigned : RISCVOp {
//===----------------------------------------------------------------------===//
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-class RVPUnaryImm10<bits<7> funct7, string opcodestr,
- DAGOperand TyImm10 = simm10>
- : RVInstIBase<0b010, OPC_OP_IMM_32, (outs GPR:$rd), (ins TyImm10:$imm10),
- opcodestr, "$rd, $imm10"> {
+class PLI_i<bits<7> funct7, string opcodestr>
+ : RVInst<(outs GPR:$rd), (ins simm10:$imm10), opcodestr, "$rd, $imm10", [],
+ InstFormatOther> {
bits<10> imm10;
+ bits<5> rd;
let Inst{31-25} = funct7;
let Inst{24-16} = imm10{8-0};
let Inst{15} = imm10{9};
+ let Inst{14-12} = 0b010;
+ let Inst{11-7} = rd;
+ let Inst{6-0} = OPC_OP_IMM_32.Value;
}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-class RVPUnaryImm8<bits<8> funct8, string opcodestr>
- : RVInstIBase<0b010, OPC_OP_IMM_32, (outs GPR:$rd), (ins uimm8:$uimm8),
- opcodestr, "$rd, $uimm8"> {
+class PLUI_i<bits<7> funct7, string opcodestr>
+ : RVInst<(outs GPR:$rd), (ins simm10_unsigned:$imm10), opcodestr,
+ "$rd, $imm10", [], InstFormatOther> {
+ bits<10> imm10;
+ bits<5> rd;
+
+ let Inst{31-25} = funct7;
+ let Inst{24} = imm10{0};
+ let Inst{23-15} = imm10{9-1};
+ let Inst{14-12} = 0b010;
+ let Inst{11-7} = rd;
+ let Inst{6-0} = OPC_OP_IMM_32.Value;
+}
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class PLI_B_i<bits<8> funct8, string opcodestr>
+ : RVInst<(outs GPR:$rd), (ins uimm8:$uimm8), opcodestr, "$rd, $uimm8", [],
+ InstFormatOther> {
bits<8> uimm8;
+ bits<5> rd;
let Inst{31-24} = funct8;
let Inst{23-16} = uimm8;
let Inst{15} = 0b0;
+ let Inst{14-12} = 0b010;
+ let Inst{11-7} = rd;
+ let Inst{6-0} = OPC_OP_IMM_32.Value;
}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-class RVPUnary<bits<3> f, string opcodestr, dag operands, string argstr>
- : RVInstIBase<0b010, OPC_OP_IMM_32, (outs GPR:$rd), operands, opcodestr, argstr> {
- bits<5> imm;
- bits<5> rs1;
-
+class RVPShift_ri<bits<3> f, bits<3> funct3, string opcodestr, Operand ImmType>
+ : RVInstIBase<funct3, OPC_OP_IMM_32, (outs GPR:$rd),
+ (ins GPR:$rs1, ImmType:$shamt), opcodestr,
+ "$rd, $rs1, $shamt"> {
let Inst{31} = 0b1;
let Inst{30-28} = f;
let Inst{27} = 0b0;
- let Inst{19-15} = rs1;
}
-class RVPUnaryImm5<bits<3> f, string opcodestr>
- : RVPUnary<f, opcodestr, (ins GPR:$rs1, uimm5:$uimm5), "$rd, $rs1, $uimm5"> {
- bits<5> uimm5;
+class RVPShiftW_ri<bits<3> f, bits<3> funct3, string opcodestr>
+ : RVPShift_ri<f, funct3, opcodestr, uimm5> {
+ bits<5> shamt;
- let imm = uimm5;
let Inst{26-25} = 0b01;
- let Inst{24-20} = uimm5;
+ let Inst{24-20} = shamt;
}
-class RVPUnaryImm4<bits<3> f, string opcodestr>
- : RVPUnary<f, opcodestr, (ins GPR:$rs1, uimm4:$uimm4), "$rd, $rs1, $uimm4"> {
- bits<4> uimm4;
+class RVPShiftH_ri<bits<3> f, bits<3> funct3, string opcodestr>
+ : RVPShift_ri<f, funct3, opcodestr, uimm4> {
+ bits<4> shamt;
let Inst{26-24} = 0b001;
- let Inst{23-20} = uimm4;
+ let Inst{23-20} = shamt;
}
-class RVPUnaryImm3<bits<3> f, string opcodestr>
- : RVPUnary<f, opcodestr, (ins GPR:$rs1, uimm3:$uimm3), "$rd, $rs1, $uimm3"> {
- bits<3> uimm3;
+class RVPShiftB_ri<bits<3> f, bits<3> funct3, string opcodestr>
+ : RVPShift_ri<f, funct3, opcodestr, uimm3> {
+ bits<3> shamt;
let Inst{26-23} = 0b0001;
- let Inst{22-20} = uimm3;
+ let Inst{22-20} = shamt;
}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-class RVPUnaryWUF<bits<2> w, bits<5> uf, string opcodestr>
+class RVPUnary_ri<bits<2> w, bits<5> uf, string opcodestr>
: RVInstIBase<0b010, OPC_OP_IMM_32, (outs GPR:$rd), (ins GPR:$rs1),
opcodestr, "$rd, $rs1"> {
let Inst{31-27} = 0b11100;
@@ -132,36 +151,36 @@ def ABSW : UnaryW_r<0b011000000111, 0b001, "absw">;
} // Predicates = [HasStdExtP, IsRV64]
let Predicates = [HasStdExtP] in {
-def PSLLI_B : RVPUnaryImm3<0b000, "pslli.b">;
-def PSLLI_H : RVPUnaryImm4<0b000, "pslli.h">;
-def PSSLAI_H : RVPUnaryImm4<0b101, "psslai.h">;
+def PSLLI_B : RVPShiftB_ri<0b000, 0b010, "pslli.b">;
+def PSLLI_H : RVPShiftH_ri<0b000, 0b010, "pslli.h">;
+def PSSLAI_H : RVPShiftH_ri<0b101, 0b010, "psslai.h">;
} // Predicates = [HasStdExtP]
let DecoderNamespace = "RV32Only",
Predicates = [HasStdExtP, IsRV32] in
-def SSLAI : RVPUnaryImm5<0b101, "sslai">;
+def SSLAI : RVPShiftW_ri<0b101, 0b010, "sslai">;
let Predicates = [HasStdExtP, IsRV64] in {
-def PSLLI_W : RVPUnaryImm5<0b000, "pslli.w">;
-def PSSLAI_W : RVPUnaryImm5<0b101, "psslai.w">;
+def PSLLI_W : RVPShiftW_ri<0b000, 0b010, "pslli.w">;
+def PSSLAI_W : RVPShiftW_ri<0b101, 0b010, "psslai.w">;
} // Predicates = [HasStdExtP, IsRV64]
let Predicates = [HasStdExtP] in
-def PLI_H : RVPUnaryImm10<0b1011000, "pli.h">;
+def PLI_H : PLI_i<0b1011000, "pli.h">;
let Predicates = [HasStdExtP, IsRV64] in
-def PLI_W : RVPUnaryImm10<0b1011001, "pli.w">;
+def PLI_W : PLI_i<0b1011001, "pli.w">;
let Predicates = [HasStdExtP] in
-def PLI_B : RVPUnaryImm8<0b10110100, "pli.b">;
+def PLI_B : PLI_B_i<0b10110100, "pli.b">;
let Predicates = [HasStdExtP] in {
-def PSEXT_H_B : RVPUnaryWUF<0b00, 0b00100, "psext.h.b">;
-def PSABS_H : RVPUnaryWUF<0b00, 0b00111, "psabs.h">;
-def PSABS_B : RVPUnaryWUF<0b10, 0b00111, "psabs.b">;
+def PSEXT_H_B : RVPUnary_ri<0b00, 0b00100, "psext.h.b">;
+def PSABS_H : RVPUnary_ri<0b00, 0b00111, "psabs.h">;
+def PSABS_B : RVPUnary_ri<0b10, 0b00111, "psabs.b">;
} // Predicates = [HasStdExtP]
let Predicates = [HasStdExtP, IsRV64] in {
-def PSEXT_W_B : RVPUnaryWUF<0b01, 0b00100, "psext.w.b">;
-def PSEXT_W_H : RVPUnaryWUF<0b01, 0b00101, "psext.w.h">;
+def PSEXT_W_B : RVPUnary_ri<0b01, 0b00100, "psext.w.b">;
+def PSEXT_W_H : RVPUnary_ri<0b01, 0b00101, "psext.w.h">;
} // Predicates = [HasStdExtP, IsRV64]
let Predicates = [HasStdExtP] in
-def PLUI_H : RVPUnaryImm10<0b1111000, "plui.h", simm10_unsigned>;
+def PLUI_H : PLUI_i<0b1111000, "plui.h">;
let Predicates = [HasStdExtP, IsRV64] in
-def PLUI_W : RVPUnaryImm10<0b1111001, "plui.w", simm10_unsigned>;
+def PLUI_W : PLUI_i<0b1111001, "plui.w">;
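
The bit assignments of the new PLUI_i class can be transcribed into a small packing helper for illustration (not part of the patch; the function name is made up): imm10{0} goes to Inst{24} and imm10{9-1} to Inst{23-15}, i.e. the immediate is rotated right by one next to the funct7 field, whereas PLI_i keeps the natural order with imm10{8-0} in Inst{24-16} and imm10{9} in Inst{15}.

#include <cstdint>

static uint32_t packPLUIImm(uint16_t Imm10) { // Imm10 is a 10-bit value.
  uint32_t Bit0 = Imm10 & 0x1;              // imm10{0}   -> Inst{24}
  uint32_t Bits9to1 = (Imm10 >> 1) & 0x1ff; // imm10{9-1} -> Inst{23-15}
  return (Bit0 << 24) | (Bits9to1 << 15);
}
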
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index de9e55b..dfa532a 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -543,7 +543,8 @@ defset list<VTypeInfoToWide> AllWidenableBFloatToFloatVectors = {
// This represents the information we need in codegen for each pseudo.
// The definition should be consistent with `struct PseudoInfo` in
// RISCVInstrInfo.h.
-class RISCVVPseudo {
+class RISCVVPseudo<dag outs, dag ins, list<dag> pattern = [],
+                   string opcodestr = "", string argstr = "">
+ : Pseudo<outs, ins, pattern, opcodestr, argstr> {
Pseudo Pseudo = !cast<Pseudo>(NAME); // Used as a key.
Instruction BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
// SEW = 0 is used to denote that the Pseudo is not SEW specific (or unknown).
@@ -785,10 +786,9 @@ class GetVTypeMinimalPredicates<VTypeInfo vti> {
class VPseudoUSLoadNoMask<VReg RetClass,
int EEW,
DAGOperand sewop = sew> :
- Pseudo<(outs RetClass:$rd),
- (ins RetClass:$dest, GPRMemZeroOffset:$rs1, AVL:$vl, sewop:$sew,
- vec_policy:$policy), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins RetClass:$dest, GPRMemZeroOffset:$rs1, AVL:$vl,
+ sewop:$sew, vec_policy:$policy), []>,
RISCVVLE</*Masked*/0, /*Strided*/0, /*FF*/0, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -801,11 +801,10 @@ class VPseudoUSLoadNoMask<VReg RetClass,
class VPseudoUSLoadMask<VReg RetClass,
int EEW> :
- Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$passthru,
- GPRMemZeroOffset:$rs1,
- VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
+ (ins GetVRegNoV0<RetClass>.R:$passthru,
+ GPRMemZeroOffset:$rs1, VMaskOp:$vm, AVL:$vl, sew:$sew,
+ vec_policy:$policy), []>,
RISCVVLE</*Masked*/1, /*Strided*/0, /*FF*/0, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -820,10 +819,9 @@ class VPseudoUSLoadMask<VReg RetClass,
class VPseudoUSLoadFFNoMask<VReg RetClass,
int EEW> :
- Pseudo<(outs RetClass:$rd, GPR:$vl),
- (ins RetClass:$dest, GPRMemZeroOffset:$rs1, AVL:$avl,
- sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs RetClass:$rd, GPR:$vl),
+ (ins RetClass:$dest, GPRMemZeroOffset:$rs1, AVL:$avl,
+ sew:$sew, vec_policy:$policy), []>,
RISCVVLE</*Masked*/0, /*Strided*/0, /*FF*/1, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -836,11 +834,10 @@ class VPseudoUSLoadFFNoMask<VReg RetClass,
class VPseudoUSLoadFFMask<VReg RetClass,
int EEW> :
- Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd, GPR:$vl),
- (ins GetVRegNoV0<RetClass>.R:$passthru,
- GPRMemZeroOffset:$rs1,
- VMaskOp:$vm, AVL:$avl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd, GPR:$vl),
+ (ins GetVRegNoV0<RetClass>.R:$passthru,
+ GPRMemZeroOffset:$rs1, VMaskOp:$vm, AVL:$avl, sew:$sew,
+ vec_policy:$policy), []>,
RISCVVLE</*Masked*/1, /*Strided*/0, /*FF*/1, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -855,10 +852,9 @@ class VPseudoUSLoadFFMask<VReg RetClass,
class VPseudoSLoadNoMask<VReg RetClass,
int EEW> :
- Pseudo<(outs RetClass:$rd),
- (ins RetClass:$dest, GPRMemZeroOffset:$rs1, GPR:$rs2, AVL:$vl,
- sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins RetClass:$dest, GPRMemZeroOffset:$rs1, GPR:$rs2,
+ AVL:$vl, sew:$sew, vec_policy:$policy), []>,
RISCVVLE</*Masked*/0, /*Strided*/1, /*FF*/0, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -871,11 +867,10 @@ class VPseudoSLoadNoMask<VReg RetClass,
class VPseudoSLoadMask<VReg RetClass,
int EEW> :
- Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$passthru,
- GPRMemZeroOffset:$rs1, GPR:$rs2,
- VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
+ (ins GetVRegNoV0<RetClass>.R:$passthru,
+ GPRMemZeroOffset:$rs1, GPR:$rs2, VMaskOp:$vm, AVL:$vl,
+ sew:$sew, vec_policy:$policy), []>,
RISCVVLE</*Masked*/1, /*Strided*/1, /*FF*/0, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -895,10 +890,9 @@ class VPseudoILoadNoMask<VReg RetClass,
bit Ordered,
bit EarlyClobber,
bits<2> TargetConstraintType = 1> :
- Pseudo<(outs RetClass:$rd),
- (ins RetClass:$dest, GPRMemZeroOffset:$rs1, IdxClass:$rs2, AVL:$vl,
- sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins RetClass:$dest, GPRMemZeroOffset:$rs1, IdxClass:$rs2,
+ AVL:$vl, sew:$sew, vec_policy:$policy), []>,
RISCVVLX</*Masked*/0, Ordered, !logtwo(EEW), VLMul, LMUL> {
let mayLoad = 1;
let mayStore = 0;
@@ -917,11 +911,10 @@ class VPseudoILoadMask<VReg RetClass,
bit Ordered,
bit EarlyClobber,
bits<2> TargetConstraintType = 1> :
- Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$passthru,
- GPRMemZeroOffset:$rs1, IdxClass:$rs2,
- VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
+ (ins GetVRegNoV0<RetClass>.R:$passthru,
+ GPRMemZeroOffset:$rs1, IdxClass:$rs2, VMaskOp:$vm,
+ AVL:$vl, sew:$sew, vec_policy:$policy), []>,
RISCVVLX</*Masked*/1, Ordered, !logtwo(EEW), VLMul, LMUL> {
let mayLoad = 1;
let mayStore = 0;
@@ -938,9 +931,9 @@ class VPseudoILoadMask<VReg RetClass,
class VPseudoUSStoreNoMask<VReg StClass,
int EEW,
DAGOperand sewop = sew> :
- Pseudo<(outs),
- (ins StClass:$rd, GPRMemZeroOffset:$rs1, AVL:$vl, sewop:$sew), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs),
+ (ins StClass:$rd, GPRMemZeroOffset:$rs1, AVL:$vl,
+ sewop:$sew), []>,
RISCVVSE</*Masked*/0, /*Strided*/0, !logtwo(EEW), VLMul> {
let mayLoad = 0;
let mayStore = 1;
@@ -951,10 +944,9 @@ class VPseudoUSStoreNoMask<VReg StClass,
class VPseudoUSStoreMask<VReg StClass,
int EEW> :
- Pseudo<(outs),
- (ins StClass:$rd, GPRMemZeroOffset:$rs1,
- VMaskOp:$vm, AVL:$vl, sew:$sew), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs),
+ (ins StClass:$rd, GPRMemZeroOffset:$rs1,
+ VMaskOp:$vm, AVL:$vl, sew:$sew), []>,
RISCVVSE</*Masked*/1, /*Strided*/0, !logtwo(EEW), VLMul> {
let mayLoad = 0;
let mayStore = 1;
@@ -966,10 +958,9 @@ class VPseudoUSStoreMask<VReg StClass,
class VPseudoSStoreNoMask<VReg StClass,
int EEW> :
- Pseudo<(outs),
- (ins StClass:$rd, GPRMemZeroOffset:$rs1, GPR:$rs2,
- AVL:$vl, sew:$sew), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs),
+ (ins StClass:$rd, GPRMemZeroOffset:$rs1, GPR:$rs2,
+ AVL:$vl, sew:$sew), []>,
RISCVVSE</*Masked*/0, /*Strided*/1, !logtwo(EEW), VLMul> {
let mayLoad = 0;
let mayStore = 1;
@@ -980,10 +971,9 @@ class VPseudoSStoreNoMask<VReg StClass,
class VPseudoSStoreMask<VReg StClass,
int EEW> :
- Pseudo<(outs),
- (ins StClass:$rd, GPRMemZeroOffset:$rs1, GPR:$rs2,
- VMaskOp:$vm, AVL:$vl, sew:$sew), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs),
+ (ins StClass:$rd, GPRMemZeroOffset:$rs1, GPR:$rs2,
+ VMaskOp:$vm, AVL:$vl, sew:$sew), []>,
RISCVVSE</*Masked*/1, /*Strided*/1, !logtwo(EEW), VLMul> {
let mayLoad = 0;
let mayStore = 1;
@@ -994,10 +984,9 @@ class VPseudoSStoreMask<VReg StClass,
}
class VPseudoNullaryNoMask<VReg RegClass> :
- Pseudo<(outs RegClass:$rd),
- (ins RegClass:$passthru,
- AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs RegClass:$rd),
+ (ins RegClass:$passthru,
+ AVL:$vl, sew:$sew, vec_policy:$policy), []> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1008,10 +997,10 @@ class VPseudoNullaryNoMask<VReg RegClass> :
}
class VPseudoNullaryMask<VReg RegClass> :
- Pseudo<(outs GetVRegNoV0<RegClass>.R:$rd),
- (ins GetVRegNoV0<RegClass>.R:$passthru,
- VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs GetVRegNoV0<RegClass>.R:$rd),
+ (ins GetVRegNoV0<RegClass>.R:$passthru,
+ VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy),
+ []> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1026,8 +1015,7 @@ class VPseudoNullaryMask<VReg RegClass> :
// Nullary for pseudo instructions. They are expanded in
// RISCVExpandPseudoInsts pass.
class VPseudoNullaryPseudoM<string BaseInst> :
- Pseudo<(outs VR:$rd), (ins AVL:$vl, sew_mask:$sew), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs VR:$rd), (ins AVL:$vl, sew_mask:$sew), []> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1041,10 +1029,9 @@ class VPseudoUnaryNoMask<DAGOperand RetClass,
DAGOperand OpClass,
string Constraint = "",
bits<2> TargetConstraintType = 1> :
- Pseudo<(outs RetClass:$rd),
- (ins RetClass:$passthru, OpClass:$rs2,
- AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins RetClass:$passthru, OpClass:$rs2,
+ AVL:$vl, sew:$sew, vec_policy:$policy), []> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1059,9 +1046,8 @@ class VPseudoUnaryNoMaskNoPolicy<DAGOperand RetClass,
DAGOperand OpClass,
string Constraint = "",
bits<2> TargetConstraintType = 1> :
- Pseudo<(outs RetClass:$rd),
- (ins OpClass:$rs2, AVL:$vl, sew_mask:$sew), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins OpClass:$rs2, AVL:$vl, sew_mask:$sew), []> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1075,10 +1061,9 @@ class VPseudoUnaryNoMaskRoundingMode<DAGOperand RetClass,
DAGOperand OpClass,
string Constraint = "",
bits<2> TargetConstraintType = 1> :
- Pseudo<(outs RetClass:$rd),
- (ins RetClass:$passthru, OpClass:$rs2, vec_rm:$rm,
- AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins RetClass:$passthru, OpClass:$rs2, vec_rm:$rm,
+ AVL:$vl, sew:$sew, vec_policy:$policy), []> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1097,10 +1082,9 @@ class VPseudoUnaryMask<VReg RetClass,
string Constraint = "",
bits<2> TargetConstraintType = 1,
DAGOperand sewop = sew> :
- Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$passthru, OpClass:$rs2,
- VMaskOp:$vm, AVL:$vl, sewop:$sew, vec_policy:$policy), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
+ (ins GetVRegNoV0<RetClass>.R:$passthru, OpClass:$rs2,
+ VMaskOp:$vm, AVL:$vl, sewop:$sew, vec_policy:$policy), []> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1117,11 +1101,10 @@ class VPseudoUnaryMaskRoundingMode<VReg RetClass,
VReg OpClass,
string Constraint = "",
bits<2> TargetConstraintType = 1> :
- Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$passthru, OpClass:$rs2,
- VMaskOp:$vm, vec_rm:$rm,
- AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
+ (ins GetVRegNoV0<RetClass>.R:$passthru, OpClass:$rs2,
+ VMaskOp:$vm, vec_rm:$rm,
+ AVL:$vl, sew:$sew, vec_policy:$policy), []> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1155,9 +1138,8 @@ class VPseudoUnaryMask_NoExcept<VReg RetClass,
}
class VPseudoUnaryNoMaskGPROut :
- Pseudo<(outs GPR:$rd),
- (ins VR:$rs2, AVL:$vl, sew_mask:$sew), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs GPR:$rd),
+ (ins VR:$rs2, AVL:$vl, sew_mask:$sew), []> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1166,9 +1148,8 @@ class VPseudoUnaryNoMaskGPROut :
}
class VPseudoUnaryMaskGPROut :
- Pseudo<(outs GPR:$rd),
- (ins VR:$rs1, VMaskOp:$vm, AVL:$vl, sew_mask:$sew), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs GPR:$rd),
+ (ins VR:$rs1, VMaskOp:$vm, AVL:$vl, sew_mask:$sew), []> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1180,10 +1161,9 @@ class VPseudoUnaryMaskGPROut :
// Mask can be V0~V31
class VPseudoUnaryAnyMask<VReg RetClass,
VReg Op1Class> :
- Pseudo<(outs RetClass:$rd),
- (ins RetClass:$passthru, Op1Class:$rs2,
- VR:$vm, AVL:$vl, sew:$sew), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins RetClass:$passthru, Op1Class:$rs2,
+ VR:$vm, AVL:$vl, sew:$sew), []> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1198,9 +1178,9 @@ class VPseudoBinaryNoMask<VReg RetClass,
string Constraint,
bits<2> TargetConstraintType = 1,
DAGOperand sewop = sew> :
- Pseudo<(outs RetClass:$rd),
- (ins Op1Class:$rs2, Op2Class:$rs1, AVL:$vl, sewop:$sew), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins Op1Class:$rs2, Op2Class:$rs1, AVL:$vl, sewop:$sew),
+ []> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1215,10 +1195,9 @@ class VPseudoBinaryNoMaskPolicy<VReg RetClass,
DAGOperand Op2Class,
string Constraint,
bits<2> TargetConstraintType = 1> :
- Pseudo<(outs RetClass:$rd),
- (ins RetClass:$passthru, Op1Class:$rs2, Op2Class:$rs1, AVL:$vl,
- sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins RetClass:$passthru, Op1Class:$rs2, Op2Class:$rs1,
+ AVL:$vl, sew:$sew, vec_policy:$policy), []> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1235,10 +1214,10 @@ class VPseudoBinaryNoMaskRoundingMode<VReg RetClass,
string Constraint,
bit UsesVXRM_ = 1,
bits<2> TargetConstraintType = 1> :
- Pseudo<(outs RetClass:$rd),
- (ins RetClass:$passthru, Op1Class:$rs2, Op2Class:$rs1, vec_rm:$rm,
- AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins RetClass:$passthru, Op1Class:$rs2, Op2Class:$rs1,
+ vec_rm:$rm, AVL:$vl, sew:$sew, vec_policy:$policy),
+ []> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1258,12 +1237,11 @@ class VPseudoBinaryMaskPolicyRoundingMode<VReg RetClass,
string Constraint,
bit UsesVXRM_,
bits<2> TargetConstraintType = 1> :
- Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$passthru,
- Op1Class:$rs2, Op2Class:$rs1,
- VMaskOp:$vm, vec_rm:$rm, AVL:$vl,
- sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
+ (ins GetVRegNoV0<RetClass>.R:$passthru,
+ Op1Class:$rs2, Op2Class:$rs1,
+ VMaskOp:$vm, vec_rm:$rm, AVL:$vl,
+ sew:$sew, vec_policy:$policy), []> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1286,10 +1264,9 @@ class VPseudoTiedBinaryNoMask<VReg RetClass,
DAGOperand Op2Class,
string Constraint,
bits<2> TargetConstraintType = 1> :
- Pseudo<(outs RetClass:$rd),
- (ins RetClass:$rs2, Op2Class:$rs1, AVL:$vl, sew:$sew,
- vec_policy:$policy), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins RetClass:$rs2, Op2Class:$rs1, AVL:$vl, sew:$sew,
+ vec_policy:$policy), []> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1307,12 +1284,11 @@ class VPseudoTiedBinaryNoMaskRoundingMode<VReg RetClass,
DAGOperand Op2Class,
string Constraint,
bits<2> TargetConstraintType = 1> :
- Pseudo<(outs RetClass:$rd),
- (ins RetClass:$rs2, Op2Class:$rs1,
- vec_rm:$rm,
- AVL:$vl, sew:$sew,
- vec_policy:$policy), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins RetClass:$rs2, Op2Class:$rs1,
+ vec_rm:$rm,
+ AVL:$vl, sew:$sew,
+ vec_policy:$policy), []> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1331,10 +1307,9 @@ class VPseudoTiedBinaryNoMaskRoundingMode<VReg RetClass,
class VPseudoIStoreNoMask<VReg StClass, VReg IdxClass, int EEW, bits<3> LMUL,
bit Ordered>:
- Pseudo<(outs),
- (ins StClass:$rd, GPRMemZeroOffset:$rs1, IdxClass:$rs2, AVL:$vl,
- sew:$sew),[]>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs),
+ (ins StClass:$rd, GPRMemZeroOffset:$rs1, IdxClass:$rs2,
+ AVL:$vl, sew:$sew),[]>,
RISCVVSX</*Masked*/0, Ordered, !logtwo(EEW), VLMul, LMUL> {
let mayLoad = 0;
let mayStore = 1;
@@ -1345,10 +1320,9 @@ class VPseudoIStoreNoMask<VReg StClass, VReg IdxClass, int EEW, bits<3> LMUL,
class VPseudoIStoreMask<VReg StClass, VReg IdxClass, int EEW, bits<3> LMUL,
bit Ordered>:
- Pseudo<(outs),
- (ins StClass:$rd, GPRMemZeroOffset:$rs1, IdxClass:$rs2,
- VMaskOp:$vm, AVL:$vl, sew:$sew),[]>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs),
+ (ins StClass:$rd, GPRMemZeroOffset:$rs1, IdxClass:$rs2,
+ VMaskOp:$vm, AVL:$vl, sew:$sew),[]>,
RISCVVSX</*Masked*/1, Ordered, !logtwo(EEW), VLMul, LMUL> {
let mayLoad = 0;
let mayStore = 1;
@@ -1363,11 +1337,11 @@ class VPseudoBinaryMaskPolicy<VReg RetClass,
DAGOperand Op2Class,
string Constraint,
bits<2> TargetConstraintType = 1> :
- Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$passthru,
- Op1Class:$rs2, Op2Class:$rs1,
- VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
+ (ins GetVRegNoV0<RetClass>.R:$passthru,
+ Op1Class:$rs2, Op2Class:$rs1,
+ VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy),
+ []> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1383,11 +1357,11 @@ class VPseudoBinaryMaskPolicy<VReg RetClass,
class VPseudoTernaryMaskPolicy<VReg RetClass,
RegisterClass Op1Class,
DAGOperand Op2Class> :
- Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$passthru,
- Op1Class:$rs2, Op2Class:$rs1,
- VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
+ (ins GetVRegNoV0<RetClass>.R:$passthru,
+ Op1Class:$rs2, Op2Class:$rs1,
+ VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy),
+ []> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1401,13 +1375,12 @@ class VPseudoTernaryMaskPolicy<VReg RetClass,
class VPseudoTernaryMaskPolicyRoundingMode<VReg RetClass,
RegisterClass Op1Class,
DAGOperand Op2Class> :
- Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$passthru,
- Op1Class:$rs2, Op2Class:$rs1,
- VMaskOp:$vm,
- vec_rm:$rm,
- AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
+ (ins GetVRegNoV0<RetClass>.R:$passthru,
+ Op1Class:$rs2, Op2Class:$rs1,
+ VMaskOp:$vm,
+ vec_rm:$rm,
+ AVL:$vl, sew:$sew, vec_policy:$policy), []> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1427,11 +1400,11 @@ class VPseudoBinaryMOutMask<VReg RetClass,
DAGOperand Op2Class,
string Constraint,
bits<2> TargetConstraintType = 1> :
- Pseudo<(outs RetClass:$rd),
- (ins RetClass:$passthru,
- Op1Class:$rs2, Op2Class:$rs1,
- VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins RetClass:$passthru,
+ Op1Class:$rs2, Op2Class:$rs1,
+ VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy),
+ []> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1451,11 +1424,11 @@ class VPseudoTiedBinaryMask<VReg RetClass,
DAGOperand Op2Class,
string Constraint,
bits<2> TargetConstraintType = 1> :
- Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$passthru,
- Op2Class:$rs1,
- VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
+ (ins GetVRegNoV0<RetClass>.R:$passthru,
+ Op2Class:$rs1,
+ VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy),
+ []> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1473,13 +1446,12 @@ class VPseudoTiedBinaryMaskRoundingMode<VReg RetClass,
DAGOperand Op2Class,
string Constraint,
bits<2> TargetConstraintType = 1> :
- Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$passthru,
- Op2Class:$rs1,
- VMaskOp:$vm,
- vec_rm:$rm,
- AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
+ (ins GetVRegNoV0<RetClass>.R:$passthru,
+ Op2Class:$rs1,
+ VMaskOp:$vm,
+ vec_rm:$rm,
+ AVL:$vl, sew:$sew, vec_policy:$policy), []> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1503,13 +1475,12 @@ class VPseudoBinaryCarry<VReg RetClass,
bit CarryIn,
string Constraint,
bits<2> TargetConstraintType = 1> :
- Pseudo<(outs RetClass:$rd),
- !if(CarryIn,
- (ins Op1Class:$rs2, Op2Class:$rs1,
- VMV0:$carry, AVL:$vl, sew:$sew),
- (ins Op1Class:$rs2, Op2Class:$rs1,
- AVL:$vl, sew:$sew)), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs RetClass:$rd),
+ !if(CarryIn,
+ (ins Op1Class:$rs2, Op2Class:$rs1,
+ VMV0:$carry, AVL:$vl, sew:$sew),
+ (ins Op1Class:$rs2, Op2Class:$rs1,
+ AVL:$vl, sew:$sew)), []> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1525,10 +1496,9 @@ class VPseudoTiedBinaryCarryIn<VReg RetClass,
DAGOperand Op2Class,
LMULInfo MInfo,
bits<2> TargetConstraintType = 1> :
- Pseudo<(outs RetClass:$rd),
- (ins RetClass:$passthru, Op1Class:$rs2, Op2Class:$rs1,
- VMV0:$carry, AVL:$vl, sew:$sew), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins RetClass:$passthru, Op1Class:$rs2, Op2Class:$rs1,
+ VMV0:$carry, AVL:$vl, sew:$sew), []> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1544,10 +1514,9 @@ class VPseudoTernaryNoMask<VReg RetClass,
RegisterClass Op1Class,
DAGOperand Op2Class,
string Constraint> :
- Pseudo<(outs RetClass:$rd),
- (ins RetClass:$rs3, Op1Class:$rs1, Op2Class:$rs2,
- AVL:$vl, sew:$sew), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins RetClass:$rs3, Op1Class:$rs1, Op2Class:$rs2,
+ AVL:$vl, sew:$sew), []> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1561,10 +1530,9 @@ class VPseudoTernaryNoMaskWithPolicy<VReg RetClass,
DAGOperand Op2Class,
string Constraint = "",
bits<2> TargetConstraintType = 1> :
- Pseudo<(outs RetClass:$rd),
- (ins RetClass:$rs3, Op1Class:$rs1, Op2Class:$rs2,
- AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins RetClass:$rs3, Op1Class:$rs1, Op2Class:$rs2,
+ AVL:$vl, sew:$sew, vec_policy:$policy), []> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1580,10 +1548,10 @@ class VPseudoTernaryNoMaskWithPolicyRoundingMode<VReg RetClass,
DAGOperand Op2Class,
string Constraint = "",
bits<2> TargetConstraintType = 1> :
- Pseudo<(outs RetClass:$rd),
- (ins RetClass:$rs3, Op1Class:$rs1, Op2Class:$rs2,
- vec_rm:$rm, AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins RetClass:$rs3, Op1Class:$rs1, Op2Class:$rs2,
+ vec_rm:$rm, AVL:$vl, sew:$sew, vec_policy:$policy),
+ []> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -1600,10 +1568,9 @@ class VPseudoTernaryNoMaskWithPolicyRoundingMode<VReg RetClass,
class VPseudoUSSegLoadNoMask<VReg RetClass,
int EEW,
bits<4> NF> :
- Pseudo<(outs RetClass:$rd),
- (ins RetClass:$dest, GPRMemZeroOffset:$rs1, AVL:$vl,
- sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins RetClass:$dest, GPRMemZeroOffset:$rs1, AVL:$vl,
+ sew:$sew, vec_policy:$policy), []>,
RISCVVLSEG<NF, /*Masked*/0, /*Strided*/0, /*FF*/0, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -1617,10 +1584,10 @@ class VPseudoUSSegLoadNoMask<VReg RetClass,
class VPseudoUSSegLoadMask<VReg RetClass,
int EEW,
bits<4> NF> :
- Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$passthru, GPRMemZeroOffset:$rs1,
- VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
+ (ins GetVRegNoV0<RetClass>.R:$passthru,
+ GPRMemZeroOffset:$rs1, VMaskOp:$vm, AVL:$vl, sew:$sew,
+ vec_policy:$policy), []>,
RISCVVLSEG<NF, /*Masked*/1, /*Strided*/0, /*FF*/0, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -1636,10 +1603,9 @@ class VPseudoUSSegLoadMask<VReg RetClass,
class VPseudoUSSegLoadFFNoMask<VReg RetClass,
int EEW,
bits<4> NF> :
- Pseudo<(outs RetClass:$rd, GPR:$vl),
- (ins RetClass:$dest, GPRMemZeroOffset:$rs1, AVL:$avl,
- sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs RetClass:$rd, GPR:$vl),
+ (ins RetClass:$dest, GPRMemZeroOffset:$rs1, AVL:$avl,
+ sew:$sew, vec_policy:$policy), []>,
RISCVVLSEG<NF, /*Masked*/0, /*Strided*/0, /*FF*/1, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -1653,10 +1619,10 @@ class VPseudoUSSegLoadFFNoMask<VReg RetClass,
class VPseudoUSSegLoadFFMask<VReg RetClass,
int EEW,
bits<4> NF> :
- Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd, GPR:$vl),
- (ins GetVRegNoV0<RetClass>.R:$passthru, GPRMemZeroOffset:$rs1,
- VMaskOp:$vm, AVL:$avl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd, GPR:$vl),
+ (ins GetVRegNoV0<RetClass>.R:$passthru,
+ GPRMemZeroOffset:$rs1, VMaskOp:$vm, AVL:$avl, sew:$sew,
+ vec_policy:$policy), []>,
RISCVVLSEG<NF, /*Masked*/1, /*Strided*/0, /*FF*/1, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -1672,10 +1638,9 @@ class VPseudoUSSegLoadFFMask<VReg RetClass,
class VPseudoSSegLoadNoMask<VReg RetClass,
int EEW,
bits<4> NF> :
- Pseudo<(outs RetClass:$rd),
- (ins RetClass:$passthru, GPRMemZeroOffset:$rs1, GPR:$offset, AVL:$vl,
- sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins RetClass:$passthru, GPRMemZeroOffset:$rs1, GPR:$offset,
+ AVL:$vl, sew:$sew, vec_policy:$policy), []>,
RISCVVLSEG<NF, /*Masked*/0, /*Strided*/1, /*FF*/0, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -1689,11 +1654,10 @@ class VPseudoSSegLoadNoMask<VReg RetClass,
class VPseudoSSegLoadMask<VReg RetClass,
int EEW,
bits<4> NF> :
- Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$passthru, GPRMemZeroOffset:$rs1,
- GPR:$offset, VMaskOp:$vm, AVL:$vl, sew:$sew,
- vec_policy:$policy), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
+ (ins GetVRegNoV0<RetClass>.R:$passthru,
+ GPRMemZeroOffset:$rs1, GPR:$offset, VMaskOp:$vm,
+ AVL:$vl, sew:$sew, vec_policy:$policy), []>,
RISCVVLSEG<NF, /*Masked*/1, /*Strided*/1, /*FF*/0, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -1712,10 +1676,10 @@ class VPseudoISegLoadNoMask<VReg RetClass,
bits<3> LMUL,
bits<4> NF,
bit Ordered> :
- Pseudo<(outs RetClass:$rd),
- (ins RetClass:$passthru, GPRMemZeroOffset:$rs1, IdxClass:$offset, AVL:$vl,
- sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins RetClass:$passthru, GPRMemZeroOffset:$rs1,
+ IdxClass:$offset, AVL:$vl, sew:$sew,
+ vec_policy:$policy), []>,
RISCVVLXSEG<NF, /*Masked*/0, Ordered, !logtwo(EEW), VLMul, LMUL> {
let mayLoad = 1;
let mayStore = 0;
@@ -1734,11 +1698,10 @@ class VPseudoISegLoadMask<VReg RetClass,
bits<3> LMUL,
bits<4> NF,
bit Ordered> :
- Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$passthru, GPRMemZeroOffset:$rs1,
- IdxClass:$offset, VMaskOp:$vm, AVL:$vl, sew:$sew,
- vec_policy:$policy), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
+ (ins GetVRegNoV0<RetClass>.R:$passthru,
+ GPRMemZeroOffset:$rs1, IdxClass:$offset, VMaskOp:$vm,
+ AVL:$vl, sew:$sew, vec_policy:$policy), []>,
RISCVVLXSEG<NF, /*Masked*/1, Ordered, !logtwo(EEW), VLMul, LMUL> {
let mayLoad = 1;
let mayStore = 0;
@@ -1756,9 +1719,9 @@ class VPseudoISegLoadMask<VReg RetClass,
class VPseudoUSSegStoreNoMask<VReg ValClass,
int EEW,
bits<4> NF> :
- Pseudo<(outs),
- (ins ValClass:$rd, GPRMemZeroOffset:$rs1, AVL:$vl, sew:$sew), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs),
+ (ins ValClass:$rd, GPRMemZeroOffset:$rs1, AVL:$vl, sew:$sew),
+ []>,
RISCVVSSEG<NF, /*Masked*/0, /*Strided*/0, !logtwo(EEW), VLMul> {
let mayLoad = 0;
let mayStore = 1;
@@ -1770,10 +1733,9 @@ class VPseudoUSSegStoreNoMask<VReg ValClass,
class VPseudoUSSegStoreMask<VReg ValClass,
int EEW,
bits<4> NF> :
- Pseudo<(outs),
- (ins ValClass:$rd, GPRMemZeroOffset:$rs1,
- VMaskOp:$vm, AVL:$vl, sew:$sew), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs),
+ (ins ValClass:$rd, GPRMemZeroOffset:$rs1,
+ VMaskOp:$vm, AVL:$vl, sew:$sew), []>,
RISCVVSSEG<NF, /*Masked*/1, /*Strided*/0, !logtwo(EEW), VLMul> {
let mayLoad = 0;
let mayStore = 1;
@@ -1786,10 +1748,9 @@ class VPseudoUSSegStoreMask<VReg ValClass,
class VPseudoSSegStoreNoMask<VReg ValClass,
int EEW,
bits<4> NF> :
- Pseudo<(outs),
- (ins ValClass:$rd, GPRMemZeroOffset:$rs1, GPR:$offset,
- AVL:$vl, sew:$sew), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs),
+ (ins ValClass:$rd, GPRMemZeroOffset:$rs1, GPR:$offset,
+ AVL:$vl, sew:$sew), []>,
RISCVVSSEG<NF, /*Masked*/0, /*Strided*/1, !logtwo(EEW), VLMul> {
let mayLoad = 0;
let mayStore = 1;
@@ -1801,10 +1762,9 @@ class VPseudoSSegStoreNoMask<VReg ValClass,
class VPseudoSSegStoreMask<VReg ValClass,
int EEW,
bits<4> NF> :
- Pseudo<(outs),
- (ins ValClass:$rd, GPRMemZeroOffset:$rs1, GPR: $offset,
- VMaskOp:$vm, AVL:$vl, sew:$sew), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs),
+ (ins ValClass:$rd, GPRMemZeroOffset:$rs1, GPR: $offset,
+ VMaskOp:$vm, AVL:$vl, sew:$sew), []>,
RISCVVSSEG<NF, /*Masked*/1, /*Strided*/1, !logtwo(EEW), VLMul> {
let mayLoad = 0;
let mayStore = 1;
@@ -1820,10 +1780,9 @@ class VPseudoISegStoreNoMask<VReg ValClass,
bits<3> LMUL,
bits<4> NF,
bit Ordered> :
- Pseudo<(outs),
- (ins ValClass:$rd, GPRMemZeroOffset:$rs1, IdxClass: $index,
- AVL:$vl, sew:$sew), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs),
+ (ins ValClass:$rd, GPRMemZeroOffset:$rs1, IdxClass: $index,
+ AVL:$vl, sew:$sew), []>,
RISCVVSXSEG<NF, /*Masked*/0, Ordered, !logtwo(EEW), VLMul, LMUL> {
let mayLoad = 0;
let mayStore = 1;
@@ -1838,10 +1797,9 @@ class VPseudoISegStoreMask<VReg ValClass,
bits<3> LMUL,
bits<4> NF,
bit Ordered> :
- Pseudo<(outs),
- (ins ValClass:$rd, GPRMemZeroOffset:$rs1, IdxClass: $index,
- VMaskOp:$vm, AVL:$vl, sew:$sew), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs),
+ (ins ValClass:$rd, GPRMemZeroOffset:$rs1, IdxClass: $index,
+ VMaskOp:$vm, AVL:$vl, sew:$sew), []>,
RISCVVSXSEG<NF, /*Masked*/1, Ordered, !logtwo(EEW), VLMul, LMUL> {
let mayLoad = 0;
let mayStore = 1;
@@ -6745,16 +6703,14 @@ let Predicates = [HasVInstructions] in {
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
let HasSEWOp = 1, BaseInstr = VMV_X_S in
def PseudoVMV_X_S:
- Pseudo<(outs GPR:$rd), (ins VR:$rs2, sew:$sew), []>,
- Sched<[WriteVMovXS, ReadVMovXS]>,
- RISCVVPseudo;
+ RISCVVPseudo<(outs GPR:$rd), (ins VR:$rs2, sew:$sew), []>,
+ Sched<[WriteVMovXS, ReadVMovXS]>;
let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VMV_S_X, isReMaterializable = 1,
Constraints = "$rd = $passthru" in
- def PseudoVMV_S_X: Pseudo<(outs VR:$rd),
+ def PseudoVMV_S_X: RISCVVPseudo<(outs VR:$rd),
(ins VR:$passthru, GPR:$rs1, AVL:$vl, sew:$sew),
[]>,
- Sched<[WriteVMovSX, ReadVMovSX_V, ReadVMovSX_X]>,
- RISCVVPseudo;
+ Sched<[WriteVMovSX, ReadVMovSX_V, ReadVMovSX_X]>;
}
} // Predicates = [HasVInstructions]
@@ -6767,18 +6723,16 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
foreach f = FPList in {
let HasSEWOp = 1, BaseInstr = VFMV_F_S in
def "PseudoVFMV_" # f.FX # "_S" :
- Pseudo<(outs f.fprclass:$rd),
+ RISCVVPseudo<(outs f.fprclass:$rd),
(ins VR:$rs2, sew:$sew), []>,
- Sched<[WriteVMovFS, ReadVMovFS]>,
- RISCVVPseudo;
+ Sched<[WriteVMovFS, ReadVMovFS]>;
let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VFMV_S_F, isReMaterializable = 1,
Constraints = "$rd = $passthru" in
def "PseudoVFMV_S_" # f.FX :
- Pseudo<(outs VR:$rd),
+ RISCVVPseudo<(outs VR:$rd),
(ins VR:$passthru, f.fprclass:$rs1, AVL:$vl, sew:$sew),
[]>,
- Sched<[WriteVMovSF, ReadVMovSF_V, ReadVMovSF_F]>,
- RISCVVPseudo;
+ Sched<[WriteVMovSF, ReadVMovSF_V, ReadVMovSF_F]>;
}
}
} // Predicates = [HasVInstructionsAnyF]
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
index 5220815..1bb67f4 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
@@ -448,11 +448,10 @@ class NDSRVInstVLN<bits<5> funct5, string opcodestr>
}
class VPseudoVLN8NoMask<VReg RetClass, bit U> :
- Pseudo<(outs RetClass:$rd),
- (ins RetClass:$dest,
- GPRMemZeroOffset:$rs1,
- AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs RetClass:$rd),
+ (ins RetClass:$dest,
+ GPRMemZeroOffset:$rs1,
+ AVL:$vl, sew:$sew, vec_policy:$policy), []>,
RISCVNDSVLN</*Masked*/0, /*Unsigned*/U, !logtwo(8), VLMul> {
let mayLoad = 1;
let mayStore = 0;
@@ -464,11 +463,11 @@ class VPseudoVLN8NoMask<VReg RetClass, bit U> :
}
class VPseudoVLN8Mask<VReg RetClass, bit U> :
- Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$passthru,
- GPRMemZeroOffset:$rs1,
- VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo,
+ RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
+ (ins GetVRegNoV0<RetClass>.R:$passthru,
+ GPRMemZeroOffset:$rs1,
+ VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy),
+ []>,
RISCVNDSVLN</*Masked*/1, /*Unsigned*/U, !logtwo(8), VLMul> {
let mayLoad = 1;
let mayStore = 0;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td
index 3912eb0..ebcf079 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td
@@ -154,18 +154,17 @@ foreach m = MxList in {
let VLMul = m.value in {
let BaseInstr = RI_VEXTRACT in
def PseudoRI_VEXTRACT_ # mx :
- Pseudo<(outs GPR:$rd), (ins m.vrclass:$rs2, uimm5:$idx, ixlenimm:$sew),
- []>,
- RISCVVPseudo;
+ RISCVVPseudo<(outs GPR:$rd),
+ (ins m.vrclass:$rs2, uimm5:$idx, ixlenimm:$sew),
+ []>;
let HasVLOp = 1, BaseInstr = RI_VINSERT, HasVecPolicyOp = 1,
Constraints = "$rd = $rs1" in
def PseudoRI_VINSERT_ # mx :
- Pseudo<(outs m.vrclass:$rd),
- (ins m.vrclass:$rs1, GPR:$rs2, uimm5:$idx, AVL:$vl,
- ixlenimm:$sew, ixlenimm:$policy),
- []>,
- RISCVVPseudo;
+ RISCVVPseudo<(outs m.vrclass:$rd),
+ (ins m.vrclass:$rs1, GPR:$rs2, uimm5:$idx, AVL:$vl,
+ ixlenimm:$sew, ixlenimm:$policy),
+ []>;
}
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
index 17fb75e..a47dfe3 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
@@ -243,10 +243,9 @@ let Predicates = [HasVendorXSfvfnrclipxfqf], DecoderNamespace = "XSfvector",
}
class VPseudoVC_X<Operand OpClass, DAGOperand RS1Class> :
- Pseudo<(outs),
- (ins OpClass:$op1, payload5:$rs2, payload5:$rd, RS1Class:$r1,
- AVL:$vl, sew:$sew), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs),
+ (ins OpClass:$op1, payload5:$rs2, payload5:$rd, RS1Class:$r1,
+ AVL:$vl, sew:$sew), []> {
let mayLoad = 0;
let mayStore = 0;
let HasVLOp = 1;
@@ -255,10 +254,9 @@ class VPseudoVC_X<Operand OpClass, DAGOperand RS1Class> :
}
class VPseudoVC_XV<Operand OpClass, VReg RS2Class, DAGOperand RS1Class> :
- Pseudo<(outs),
- (ins OpClass:$op1, payload5:$rd, RS2Class:$rs2, RS1Class:$r1,
- AVL:$vl, sew:$sew), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs),
+ (ins OpClass:$op1, payload5:$rd, RS2Class:$rs2, RS1Class:$r1,
+ AVL:$vl, sew:$sew), []> {
let mayLoad = 0;
let mayStore = 0;
let HasVLOp = 1;
@@ -268,10 +266,9 @@ class VPseudoVC_XV<Operand OpClass, VReg RS2Class, DAGOperand RS1Class> :
class VPseudoVC_XVV<Operand OpClass, VReg RDClass, VReg RS2Class,
DAGOperand RS1Class> :
- Pseudo<(outs),
- (ins OpClass:$op1, RDClass:$rd, RS2Class:$rs2, RS1Class:$r1,
- AVL:$vl, sew:$sew), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs),
+ (ins OpClass:$op1, RDClass:$rd, RS2Class:$rs2, RS1Class:$r1,
+ AVL:$vl, sew:$sew), []> {
let mayLoad = 0;
let mayStore = 0;
let HasVLOp = 1;
@@ -280,10 +277,9 @@ class VPseudoVC_XVV<Operand OpClass, VReg RDClass, VReg RS2Class,
}
class VPseudoVC_V_X<Operand OpClass, VReg RDClass, DAGOperand RS1Class> :
- Pseudo<(outs RDClass:$rd),
- (ins OpClass:$op1, payload5:$rs2, RS1Class:$r1,
- AVL:$vl, sew:$sew), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs RDClass:$rd),
+ (ins OpClass:$op1, payload5:$rs2, RS1Class:$r1,
+ AVL:$vl, sew:$sew), []> {
let mayLoad = 0;
let mayStore = 0;
let HasVLOp = 1;
@@ -293,10 +289,9 @@ class VPseudoVC_V_X<Operand OpClass, VReg RDClass, DAGOperand RS1Class> :
class VPseudoVC_V_XV<Operand OpClass, VReg RDClass, VReg RS2Class,
DAGOperand RS1Class> :
- Pseudo<(outs RDClass:$rd),
- (ins OpClass:$op1, RS2Class:$rs2, RS1Class:$r1,
- AVL:$vl, sew:$sew), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs RDClass:$rd),
+ (ins OpClass:$op1, RS2Class:$rs2, RS1Class:$r1,
+ AVL:$vl, sew:$sew), []> {
let mayLoad = 0;
let mayStore = 0;
let HasVLOp = 1;
@@ -306,10 +301,9 @@ class VPseudoVC_V_XV<Operand OpClass, VReg RDClass, VReg RS2Class,
class VPseudoVC_V_XVV<Operand OpClass, VReg RDClass, VReg RS2Class,
DAGOperand RS1Class> :
- Pseudo<(outs RDClass:$rd),
- (ins OpClass:$op1, RDClass:$rs3, RS2Class:$rs2, RS1Class:$r1,
- AVL:$vl, sew:$sew), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs RDClass:$rd),
+ (ins OpClass:$op1, RDClass:$rs3, RS2Class:$rs2, RS1Class:$r1,
+ AVL:$vl, sew:$sew), []> {
let mayLoad = 0;
let mayStore = 0;
let HasVLOp = 1;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
index 4147c97..a250ac8 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
@@ -230,9 +230,8 @@ class ZvkMxSet<string vd_lmul> {
}
class VPseudoBinaryNoMask_Zvk<DAGOperand RetClass, VReg OpClass> :
- Pseudo<(outs RetClass:$rd_wb),
- (ins RetClass:$rd, OpClass:$rs2, AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo {
+ RISCVVPseudo<(outs RetClass:$rd_wb),
+ (ins RetClass:$rd, OpClass:$rs2, AVL:$vl, sew:$sew, vec_policy:$policy), []> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -246,10 +245,9 @@ class VPseudoBinaryNoMask_Zvk<DAGOperand RetClass, VReg OpClass> :
class VPseudoTernaryNoMask_Zvk<VReg RetClass,
VReg Op1Class,
DAGOperand Op2Class> :
- Pseudo<(outs RetClass:$rd_wb),
+ RISCVVPseudo<(outs RetClass:$rd_wb),
(ins RetClass:$rd, Op1Class:$rs2, Op2Class:$rs1,
- AVL:$vl, sew:$sew, vec_policy:$policy), []>,
- RISCVVPseudo {
+ AVL:$vl, sew:$sew, vec_policy:$policy), []> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
index 6de870c..0565fcd 100644
--- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
@@ -131,10 +131,14 @@ static bool getMemOperands(unsigned Factor, VectorType *VTy, Type *XLenTy,
: Constant::getAllOnesValue(XLenTy);
return true;
}
- if (auto *VPLdSt = dyn_cast<VPIntrinsic>(I)) {
- assert((VPLdSt->getIntrinsicID() == Intrinsic::vp_load ||
- VPLdSt->getIntrinsicID() == Intrinsic::vp_store) &&
- "Unexpected intrinsic");
+
+ auto *II = cast<IntrinsicInst>(I);
+ switch (II->getIntrinsicID()) {
+ default:
+ llvm_unreachable("Unsupported intrinsic type");
+ case Intrinsic::vp_load:
+ case Intrinsic::vp_store: {
+ auto *VPLdSt = cast<VPIntrinsic>(I);
Ptr = VPLdSt->getMemoryPointerParam();
Alignment = VPLdSt->getPointerAlignment().value_or(
DL.getABITypeAlign(VTy->getElementType()));
@@ -151,21 +155,32 @@ static bool getMemOperands(unsigned Factor, VectorType *VTy, Type *XLenTy,
VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);
return true;
}
- auto *II = cast<IntrinsicInst>(I);
- assert(II->getIntrinsicID() == Intrinsic::masked_load &&
- "Unexpected intrinsic");
- Ptr = II->getOperand(0);
- Alignment = cast<ConstantInt>(II->getArgOperand(1))->getAlignValue();
+ case Intrinsic::masked_load: {
+ Ptr = II->getOperand(0);
+ Alignment = cast<ConstantInt>(II->getArgOperand(1))->getAlignValue();
- if (!isa<UndefValue>(II->getOperand(3)))
- return false;
+ if (!isa<UndefValue>(II->getOperand(3)))
+ return false;
- assert(Mask && "masked.load needs a mask!");
+ assert(Mask && "masked.load needs a mask!");
- VL = isa<FixedVectorType>(VTy)
- ? Builder.CreateElementCount(XLenTy, VTy->getElementCount())
- : Constant::getAllOnesValue(XLenTy);
- return true;
+ VL = isa<FixedVectorType>(VTy)
+ ? Builder.CreateElementCount(XLenTy, VTy->getElementCount())
+ : Constant::getAllOnesValue(XLenTy);
+ return true;
+ }
+ case Intrinsic::masked_store: {
+ Ptr = II->getOperand(1);
+ Alignment = cast<ConstantInt>(II->getArgOperand(2))->getAlignValue();
+
+ assert(Mask && "masked.store needs a mask!");
+
+ VL = isa<FixedVectorType>(VTy)
+ ? Builder.CreateElementCount(XLenTy, VTy->getElementCount())
+ : Constant::getAllOnesValue(XLenTy);
+ return true;
+ }
+ }
}
/// Lower an interleaved load into a vlsegN intrinsic.
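As a quick map of the getMemOperands() rework above: the helper now dispatches on the intrinsic ID, and each supported form keeps its memory pointer in a different slot. A minimal stand-alone restatement of just that dispatch, assuming the usual LLVM headers; the helper name is illustrative and not part of the patch:

#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/Support/ErrorHandling.h"

using namespace llvm;

// Pointer operand positions assumed by the new switch:
//   masked.load (ptr, align, mask, passthru)  -> arg 0
//   masked.store(value, ptr, align, mask)     -> arg 1
//   vp.load / vp.store                        -> the VP memory pointer param
static Value *memPointerOf(const IntrinsicInst &II) {
  switch (II.getIntrinsicID()) {
  case Intrinsic::masked_load:
    return II.getArgOperand(0);
  case Intrinsic::masked_store:
    return II.getArgOperand(1);
  case Intrinsic::vp_load:
  case Intrinsic::vp_store:
    return cast<VPIntrinsic>(&II)->getMemoryPointerParam();
  default:
    llvm_unreachable("intrinsic not handled by getMemOperands");
  }
}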
@@ -189,7 +204,7 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
const DataLayout &DL = Load->getDataLayout();
auto *VTy = cast<FixedVectorType>(Shuffles[0]->getType());
- auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen());
+ auto *XLenTy = Builder.getIntNTy(Subtarget.getXLen());
Value *Ptr, *VL;
Align Alignment;
@@ -217,6 +232,7 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load,
{VTy, BasePtr->getType(), Stride->getType()},
{BasePtr, Stride, Mask, VL});
+ Alignment = commonAlignment(Alignment, Indices[0] * ScalarSizeInBytes);
CI->addParamAttr(0,
Attribute::getWithAlignment(CI->getContext(), Alignment));
Shuffles[0]->replaceAllUsesWith(CI);
@@ -250,22 +266,28 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
///
/// Note that the new shufflevectors will be removed and we'll only generate one
/// vsseg3 instruction in CodeGen.
-bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
+bool RISCVTargetLowering::lowerInterleavedStore(Instruction *Store,
+ Value *LaneMask,
ShuffleVectorInst *SVI,
unsigned Factor) const {
- IRBuilder<> Builder(SI);
- const DataLayout &DL = SI->getDataLayout();
+ IRBuilder<> Builder(Store);
+ const DataLayout &DL = Store->getDataLayout();
auto Mask = SVI->getShuffleMask();
auto *ShuffleVTy = cast<FixedVectorType>(SVI->getType());
// Given SVI : <n*factor x ty>, then VTy : <n x ty>
auto *VTy = FixedVectorType::get(ShuffleVTy->getElementType(),
ShuffleVTy->getNumElements() / Factor);
- if (!isLegalInterleavedAccessType(VTy, Factor, SI->getAlign(),
- SI->getPointerAddressSpace(), DL))
+ auto *XLenTy = Builder.getIntNTy(Subtarget.getXLen());
+
+ Value *Ptr, *VL;
+ Align Alignment;
+ if (!getMemOperands(Factor, VTy, XLenTy, Store, Ptr, LaneMask, VL, Alignment))
return false;
- auto *PtrTy = SI->getPointerOperandType();
- auto *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen());
+ Type *PtrTy = Ptr->getType();
+ unsigned AS = PtrTy->getPointerAddressSpace();
+ if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL))
+ return false;
unsigned Index;
// If the segment store only has one active lane (i.e. the interleave is
@@ -276,26 +298,27 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
unsigned ScalarSizeInBytes =
DL.getTypeStoreSize(ShuffleVTy->getElementType());
Value *Data = SVI->getOperand(0);
- auto *DataVTy = cast<FixedVectorType>(Data->getType());
+ Data = Builder.CreateExtractVector(VTy, Data, uint64_t(0));
Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
Value *Offset = ConstantInt::get(XLenTy, Index * ScalarSizeInBytes);
- Value *BasePtr = Builder.CreatePtrAdd(SI->getPointerOperand(), Offset);
- Value *Mask = Builder.getAllOnesMask(DataVTy->getElementCount());
- Value *VL = Builder.CreateElementCount(Builder.getInt32Ty(),
- VTy->getElementCount());
-
- CallInst *CI = Builder.CreateIntrinsic(
- Intrinsic::experimental_vp_strided_store,
- {Data->getType(), BasePtr->getType(), Stride->getType()},
- {Data, BasePtr, Stride, Mask, VL});
- CI->addParamAttr(
- 1, Attribute::getWithAlignment(CI->getContext(), SI->getAlign()));
+ Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset);
+ // Note: Same VL as above, but i32 not xlen due to signature of
+ // vp.strided.store
+ VL = Builder.CreateElementCount(Builder.getInt32Ty(),
+ VTy->getElementCount());
+ CallInst *CI =
+ Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_store,
+ {VTy, BasePtr->getType(), Stride->getType()},
+ {Data, BasePtr, Stride, LaneMask, VL});
+ Alignment = commonAlignment(Alignment, Index * ScalarSizeInBytes);
+ CI->addParamAttr(1,
+ Attribute::getWithAlignment(CI->getContext(), Alignment));
return true;
}
Function *VssegNFunc = Intrinsic::getOrInsertDeclaration(
- SI->getModule(), FixedVssegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy});
+ Store->getModule(), FixedVssegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy});
SmallVector<Value *, 10> Ops;
SmallVector<int, 16> NewShuffleMask;
@@ -311,13 +334,7 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
NewShuffleMask.clear();
}
- // This VL should be OK (should be executable in one vsseg instruction,
- // potentially under larger LMULs) because we checked that the fixed vector
- // type fits in isLegalInterleavedAccessType
- Value *VL = Builder.CreateElementCount(XLenTy, VTy->getElementCount());
- Value *StoreMask = Builder.getAllOnesMask(VTy->getElementCount());
- Ops.append({SI->getPointerOperand(), StoreMask, VL});
-
+ Ops.append({Ptr, LaneMask, VL});
Builder.CreateCall(VssegNFunc, Ops);
return true;
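One note on the commonAlignment() calls added here and in lowerInterleavedLoad(): the strided access starts Index elements past the original base pointer, so only the alignment shared by the base alignment and that byte offset can be promised to the intrinsic. A tiny self-contained check of that arithmetic (the concrete numbers are made up):

#include "llvm/Support/Alignment.h"
#include <cassert>
#include <cstdint>

int main() {
  using namespace llvm;
  // A 16-byte aligned interleaved store where only lane Index = 1 of i32
  // elements is live: the strided access begins 4 bytes in, so just 4-byte
  // alignment survives.
  Align Base(16);
  uint64_t OffsetInBytes = 1 * 4; // Index * ScalarSizeInBytes
  assert(commonAlignment(Base, OffsetInBytes) == Align(4));
  return 0;
}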
@@ -334,7 +351,7 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
VectorType *ResVTy = getDeinterleavedVectorType(DI);
const DataLayout &DL = Load->getDataLayout();
- auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen());
+ auto *XLenTy = Builder.getIntNTy(Subtarget.getXLen());
Value *Ptr, *VL;
Align Alignment;
@@ -355,8 +372,7 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
unsigned NumElts = ResVTy->getElementCount().getKnownMinValue();
Type *VecTupTy = TargetExtType::get(
Load->getContext(), "riscv.vector.tuple",
- ScalableVectorType::get(Type::getInt8Ty(Load->getContext()),
- NumElts * SEW / 8),
+ ScalableVectorType::get(Builder.getInt8Ty(), NumElts * SEW / 8),
Factor);
Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration(
Load->getModule(), ScalableVlsegIntrIds[Factor - 2],
@@ -397,7 +413,7 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
auto *InVTy = cast<VectorType>(InterleaveValues[0]->getType());
const DataLayout &DL = Store->getDataLayout();
- Type *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen());
+ Type *XLenTy = Builder.getIntNTy(Subtarget.getXLen());
Value *Ptr, *VL;
Align Alignment;
@@ -421,9 +437,7 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
unsigned NumElts = InVTy->getElementCount().getKnownMinValue();
Type *VecTupTy = TargetExtType::get(
Store->getContext(), "riscv.vector.tuple",
- ScalableVectorType::get(Type::getInt8Ty(Store->getContext()),
- NumElts * SEW / 8),
- Factor);
+ ScalableVectorType::get(Builder.getInt8Ty(), NumElts * SEW / 8), Factor);
Value *StoredVal = PoisonValue::get(VecTupTy);
for (unsigned i = 0; i < Factor; ++i)
@@ -440,91 +454,3 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
Builder.CreateCall(VssegNFunc, Operands);
return true;
}
-
-/// Lower an interleaved vp.store into a vssegN intrinsic.
-///
-/// E.g. Lower an interleaved vp.store (Factor = 2):
-///
-/// %is = tail call <vscale x 64 x i8>
-/// @llvm.vector.interleave2.nxv64i8(
-/// <vscale x 32 x i8> %load0,
-/// <vscale x 32 x i8> %load1
-/// %wide.rvl = shl nuw nsw i32 %rvl, 1
-/// tail call void @llvm.vp.store.nxv64i8.p0(
-/// <vscale x 64 x i8> %is, ptr %ptr,
-/// %mask,
-/// i32 %wide.rvl)
-///
-/// Into:
-/// call void @llvm.riscv.vsseg2.mask.nxv32i8.i64(
-/// <vscale x 32 x i8> %load1,
-/// <vscale x 32 x i8> %load2, ptr %ptr,
-/// %mask,
-/// i64 %rvl)
-bool RISCVTargetLowering::lowerInterleavedVPStore(
- VPIntrinsic *Store, Value *Mask,
- ArrayRef<Value *> InterleaveOperands) const {
- assert(Mask && "Expect a valid mask");
- assert(Store->getIntrinsicID() == Intrinsic::vp_store &&
- "Unexpected intrinsic");
-
- const unsigned Factor = InterleaveOperands.size();
-
- auto *VTy = dyn_cast<VectorType>(InterleaveOperands[0]->getType());
- if (!VTy)
- return false;
-
- const DataLayout &DL = Store->getDataLayout();
- Align Alignment = Store->getParamAlign(1).value_or(
- DL.getABITypeAlign(VTy->getElementType()));
- if (!isLegalInterleavedAccessType(
- VTy, Factor, Alignment,
- Store->getArgOperand(1)->getType()->getPointerAddressSpace(), DL))
- return false;
-
- IRBuilder<> Builder(Store);
- Value *WideEVL = Store->getArgOperand(3);
- // Conservatively check if EVL is a multiple of factor, otherwise some
- // (trailing) elements might be lost after the transformation.
- if (!isMultipleOfN(WideEVL, Store->getDataLayout(), Factor))
- return false;
-
- auto *PtrTy = Store->getArgOperand(1)->getType();
- auto *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen());
- auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
- Value *EVL =
- Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);
-
- if (isa<FixedVectorType>(VTy)) {
- SmallVector<Value *, 8> Operands(InterleaveOperands);
- Operands.append({Store->getArgOperand(1), Mask, EVL});
- Builder.CreateIntrinsic(FixedVssegIntrIds[Factor - 2],
- {VTy, PtrTy, XLenTy}, Operands);
- return true;
- }
-
- unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType());
- unsigned NumElts = VTy->getElementCount().getKnownMinValue();
- Type *VecTupTy = TargetExtType::get(
- Store->getContext(), "riscv.vector.tuple",
- ScalableVectorType::get(Type::getInt8Ty(Store->getContext()),
- NumElts * SEW / 8),
- Factor);
-
- Function *VecInsertFunc = Intrinsic::getOrInsertDeclaration(
- Store->getModule(), Intrinsic::riscv_tuple_insert, {VecTupTy, VTy});
- Value *StoredVal = PoisonValue::get(VecTupTy);
- for (unsigned i = 0; i < Factor; ++i)
- StoredVal = Builder.CreateCall(
- VecInsertFunc, {StoredVal, InterleaveOperands[i], Builder.getInt32(i)});
-
- Function *VssegNFunc = Intrinsic::getOrInsertDeclaration(
- Store->getModule(), ScalableVssegIntrIds[Factor - 2],
- {VecTupTy, PtrTy, Mask->getType(), EVL->getType()});
-
- Value *Operands[] = {StoredVal, Store->getArgOperand(1), Mask, EVL,
- ConstantInt::get(XLenTy, Log2_64(SEW))};
-
- Builder.CreateCall(VssegNFunc, Operands);
- return true;
-}
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td
index 3e286a7..bf23812 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td
@@ -24,6 +24,67 @@ class SMX60IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit isF = 0
bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW));
}
+defvar SMX60VLEN = 256;
+defvar SMX60DLEN = !div(SMX60VLEN, 2);
+
+class Get1248Latency<string mx> {
+ int c = !cond(
+ !eq(mx, "M2") : 2,
+ !eq(mx, "M4") : 4,
+ !eq(mx, "M8") : 8,
+ true: 1
+ );
+}
+
+// Used for: logical ops, shifts, sign ext, merge/move, FP sign/recip/convert, mask ops, slides
+class Get4816Latency<string mx> {
+ int c = !cond(
+ !eq(mx, "M4") : 8,
+ !eq(mx, "M8") : 16,
+ true: 4
+ );
+}
+
+// Used for: arithmetic (add/sub/min/max), saturating/averaging, FP add/sub/min/max
+class Get458Latency<string mx> {
+ int c = !cond(
+ !eq(mx, "M4") : 5,
+ !eq(mx, "M8") : 8,
+ true: 4
+ );
+}
+
+// Widening scaling pattern (4,4,4,4,5,8,8): plateaus at higher LMULs
+// Used for: widening operations
+class Get4588Latency<string mx> {
+ int c = !cond(
+ !eq(mx, "M2") : 5,
+ !eq(mx, "M4") : 8,
+ !eq(mx, "M8") : 8, // M8 not supported for most widening, fallback
+ true: 4
+ );
+}
+
+// Used for: mask-producing comparisons, carry ops with mask, FP comparisons
+class Get461018Latency<string mx> {
+ int c = !cond(
+ !eq(mx, "M2") : 6,
+ !eq(mx, "M4") : 10,
+ !eq(mx, "M8") : 18,
+ true: 4
+ );
+}
+
+// Used for: e64 multiply pattern, complex ops
+class Get781632Latency<string mx> {
+ int c = !cond(
+ !eq(mx, "M2") : 8,
+ !eq(mx, "M4") : 16,
+ !eq(mx, "M8") : 32,
+ true: 7
+ );
+}
+
def SpacemitX60Model : SchedMachineModel {
let IssueWidth = 2; // dual-issue
let MicroOpBufferSize = 0; // in-order
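The GetNNN helper classes above are plain per-LMUL lookup tables: the digits in each name spell the cycle counts returned as LMUL grows, with fractional LMULs falling into the default bucket. For readers less used to TableGen !cond, here is Get4816Latency written out in C++ (a sketch, not part of the patch):

#include <string>

// Mirrors Get4816Latency: MF8/MF4/MF2/M1/M2 -> 4 cycles, M4 -> 8, M8 -> 16,
// i.e. the 4/4/8/16 pattern quoted for vand/vor/vxor further down.
static unsigned get4816Latency(const std::string &MX) {
  if (MX == "M4")
    return 8;
  if (MX == "M8")
    return 16;
  return 4;
}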
@@ -322,58 +383,96 @@ foreach LMul = [1, 2, 4, 8] in {
foreach mx = SchedMxList in {
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
- defm "" : LMULWriteResMX<"WriteVIALUV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIALUX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIALUI", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVExtV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICALUV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICALUX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICALUI", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICALUMV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICALUMX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICALUMI", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICmpV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICmpX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICmpI", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMergeV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMergeX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMergeI", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMovV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMovX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMovI", [SMX60_VIEU], mx, IsWorstCase>;
-
- defm "" : LMULWriteResMX<"WriteVShiftV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVShiftX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVShiftI", [SMX60_VIEU], mx, IsWorstCase>;
-
- defm "" : LMULWriteResMX<"WriteVIMulV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMulX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMulAddV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMulAddX", [SMX60_VIEU], mx, IsWorstCase>;
+ let Latency = Get458Latency<mx>.c, ReleaseAtCycles = [4] in {
+ defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SMX60_VIEU], mx, IsWorstCase>;
+ }
+
+ let Latency = Get4816Latency<mx>.c, ReleaseAtCycles = [4] in {
+ // Pattern of vadd, vsub, vrsub: 4/4/5/8
+ // Pattern of vand, vor, vxor: 4/4/8/16
+    // They are grouped together, so we use the worst case 4/4/8/16
+ // TODO: use InstRW to override individual instructions' scheduling data
+ defm "" : LMULWriteResMX<"WriteVIALUV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIALUX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIALUI", [SMX60_VIEU], mx, IsWorstCase>;
+
+ defm "" : LMULWriteResMX<"WriteVExtV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMergeV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMergeX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMergeI", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMovV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMovX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMovI", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVShiftV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVShiftX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVShiftI", [SMX60_VIEU], mx, IsWorstCase>;
+
+ defm "" : LMULWriteResMX<"WriteVICALUV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUI", [SMX60_VIEU], mx, IsWorstCase>;
+ }
+
+ let Latency = Get461018Latency<mx>.c, ReleaseAtCycles = [4] in {
+ defm "" : LMULWriteResMX<"WriteVICALUMV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUMX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUMI", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICmpV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICmpX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICmpI", [SMX60_VIEU], mx, IsWorstCase>;
+ }
+
+  // Pattern of vmacc, vmadd, vmul, vmulh, etc.: e8/e16 = 4/4/5/8, e32 = 5/5/5/8,
+  // e64 = 7/8/16/32. We use the worst-case until we can split the SEW.
+ // TODO: change WriteVIMulV, etc to be defined with LMULSEWSchedWrites
+ let Latency = Get781632Latency<mx>.c, ReleaseAtCycles = [7] in {
+ defm "" : LMULWriteResMX<"WriteVIMulV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMulX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMulAddV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMulAddX", [SMX60_VIEU], mx, IsWorstCase>;
+ }
}
// Widening
+// Pattern of vwmul, vwmacc, etc.: e8/e16 = 4/4/5/8, e32 = 5/5/5/8
+// We use the worst-case for all.
foreach mx = SchedMxListW in {
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c;
- defm "" : LMULWriteResMX<"WriteVIWALUV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIWALUX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIWALUI", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIWMulV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIWMulX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIWMulAddV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIWMulAddX", [SMX60_VIEU], mx, IsWorstCase>;
+ let Latency = Get4588Latency<mx>.c, ReleaseAtCycles = [4] in {
+ defm "" : LMULWriteResMX<"WriteVIWALUV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWALUX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWALUI", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWMulV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWMulX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWMulAddV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWMulAddX", [SMX60_VIEU], mx, IsWorstCase>;
+ }
}
-// Vector Integer Division and Remainder
+// Division and remainder operations
+// Pattern of vdivu: 11/11/11/20/40/80/160
+// Pattern of vdiv: 12/12/12/22/44/88/176
+// Pattern of vremu: 12/12/12/22/44/88/176
+// Pattern of vrem: 13/13/13/24/48/96/192
+// We use 12/12/12/24/48/96/192 for all of them
+// TODO: Create separate WriteVIRem to more closely match the latencies
foreach mx = SchedMxList in {
foreach sew = SchedSEWSet<mx>.val in {
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
- defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SMX60_VIEU], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SMX60_VIEU], mx, sew, IsWorstCase>;
+ // Slightly reduced for fractional LMULs
+ defvar Multiplier = !cond(
+ !eq(mx, "MF8") : 12,
+ !eq(mx, "MF4") : 12,
+ !eq(mx, "MF2") : 12,
+ true: 24
+ );
+
+ let Latency = !mul(Get1248Latency<mx>.c, Multiplier), ReleaseAtCycles = [12] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SMX60_VIEU], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SMX60_VIEU], mx, sew, IsWorstCase>;
+ }
}
}
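To see how the division numbers come together: the effective latency is Get1248Latency<mx>.c scaled by the per-LMUL multiplier chosen above, which reproduces the 12/12/12/24/48/96/192 sequence quoted in the comment. A small self-contained check of that arithmetic (fractional LMULs take the 12x multiplier, everything else 24x, per the !cond above):

#include <cstdio>
#include <string>

// Mirrors Get1248Latency from earlier in this file.
static unsigned get1248Latency(const std::string &MX) {
  if (MX == "M2")
    return 2;
  if (MX == "M4")
    return 4;
  if (MX == "M8")
    return 8;
  return 1; // MF8/MF4/MF2/M1
}

int main() {
  const char *MXs[] = {"MF8", "MF4", "MF2", "M1", "M2", "M4", "M8"};
  for (const char *MX : MXs) {
    bool Fractional = MX[1] == 'F';
    unsigned Multiplier = Fractional ? 12 : 24;
    // Prints 12 12 12 24 48 96 192, matching the comment above.
    std::printf("%s: %u\n", MX, get1248Latency(MX) * Multiplier);
  }
  return 0;
}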
@@ -381,12 +480,21 @@ foreach mx = SchedMxList in {
foreach mx = SchedMxListW in {
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c;
- defm "" : LMULWriteResMX<"WriteVNShiftV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVNShiftX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVNShiftI", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVNClipV", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVNClipX", [SMX60_VIEU], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVNClipI", [SMX60_VIEU], mx, IsWorstCase>;
+ // Slightly increased for integer LMULs
+ defvar Multiplier = !cond(
+ !eq(mx, "M2") : 2,
+ !eq(mx, "M4") : 2,
+ true: 1
+ );
+
+ let Latency = !mul(Get4816Latency<mx>.c, Multiplier), ReleaseAtCycles = [4] in {
+ defm "" : LMULWriteResMX<"WriteVNShiftV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVNShiftX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVNShiftI", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVNClipV", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVNClipX", [SMX60_VIEU], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVNClipI", [SMX60_VIEU], mx, IsWorstCase>;
+ }
}
// 12. Vector Fixed-Point Arithmetic Instructions
diff --git a/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.cpp b/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.cpp
index 668e596..6ecddad 100644
--- a/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVSelectionDAGInfo.cpp
@@ -24,6 +24,18 @@ void RISCVSelectionDAGInfo::verifyTargetNode(const SelectionDAG &DAG,
switch (N->getOpcode()) {
default:
return SelectionDAGGenTargetInfo::verifyTargetNode(DAG, N);
+ case RISCVISD::TUPLE_EXTRACT:
+    assert(N->getNumOperands() == 2 && "Expected two operands!");
+ assert(N->getOperand(1).getOpcode() == ISD::TargetConstant &&
+ N->getOperand(1).getValueType() == MVT::i32 &&
+ "Expected index to be an i32 target constant!");
+ break;
+ case RISCVISD::TUPLE_INSERT:
+ assert(N->getNumOperands() == 3 && "Expected three operands!");
+ assert(N->getOperand(2).getOpcode() == ISD::TargetConstant &&
+ N->getOperand(2).getValueType() == MVT::i32 &&
+ "Expected index to be an i32 target constant!");
+ break;
case RISCVISD::VQDOT_VL:
case RISCVISD::VQDOTU_VL:
case RISCVISD::VQDOTSU_VL: {
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
index c754de4..e35ffaf 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
@@ -216,7 +216,7 @@ unsigned RISCVSubtarget::getMinimumJumpTableEntries() const {
}
void RISCVSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
- unsigned NumRegionInstrs) const {
+ const SchedRegion &Region) const {
// Do bidirectional scheduling since it provides a more balanced scheduling
// leading to better performance. This will increase compile time.
Policy.OnlyTopDown = false;
@@ -231,8 +231,8 @@ void RISCVSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
Policy.ShouldTrackPressure = true;
}
-void RISCVSubtarget::overridePostRASchedPolicy(MachineSchedPolicy &Policy,
- unsigned NumRegionInstrs) const {
+void RISCVSubtarget::overridePostRASchedPolicy(
+ MachineSchedPolicy &Policy, const SchedRegion &Region) const {
MISched::Direction PostRASchedDirection = getPostRASchedDirection();
if (PostRASchedDirection == MISched::TopDown) {
Policy.OnlyTopDown = true;
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index 4f560cc..fd57e02 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -395,11 +395,11 @@ public:
}
void overrideSchedPolicy(MachineSchedPolicy &Policy,
- unsigned NumRegionInstrs) const override;
+ const SchedRegion &Region) const override;
void overridePostRASchedPolicy(MachineSchedPolicy &Policy,
- unsigned NumRegionInstrs) const override;
+ const SchedRegion &Region) const override;
};
-} // End llvm namespace
+} // namespace llvm
#endif
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 56ead92..fd634b5 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1489,6 +1489,34 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
cast<VectorType>(ICA.getArgTypes()[0]), {}, CostKind,
0, cast<VectorType>(ICA.getReturnType()));
}
+ case Intrinsic::fptoui_sat:
+ case Intrinsic::fptosi_sat: {
+ InstructionCost Cost = 0;
+ bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
+ Type *SrcTy = ICA.getArgTypes()[0];
+
+ auto SrcLT = getTypeLegalizationCost(SrcTy);
+ auto DstLT = getTypeLegalizationCost(RetTy);
+ if (!SrcTy->isVectorTy())
+ break;
+
+ if (!SrcLT.first.isValid() || !DstLT.first.isValid())
+ return InstructionCost::getInvalid();
+
+ Cost +=
+ getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
+ RetTy, SrcTy, TTI::CastContextHint::None, CostKind);
+
+ // Handle NaN.
+ // vmfne v0, v8, v8 # If v8[i] is NaN set v0[i] to 1.
+ // vmerge.vim v8, v8, 0, v0 # Convert NaN to 0.
+ Type *CondTy = RetTy->getWithNewBitWidth(1);
+ Cost += getCmpSelInstrCost(BinaryOperator::FCmp, SrcTy, CondTy,
+ CmpInst::FCMP_UNO, CostKind);
+ Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
+ CmpInst::FCMP_UNO, CostKind);
+ return Cost;
+ }
}
if (ST->hasVInstructions() && RetTy->isVectorTy()) {
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 12bf8c1..d62d99c 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -116,8 +116,8 @@ public:
}
TailFoldingStyle
getPreferredTailFoldingStyle(bool IVUpdateMayOverflow) const override {
- return ST->hasVInstructions() ? TailFoldingStyle::Data
- : TailFoldingStyle::DataWithoutLaneMask;
+ return ST->hasVInstructions() ? TailFoldingStyle::DataWithEVL
+ : TailFoldingStyle::None;
}
std::optional<unsigned> getMaxVScale() const override;
std::optional<unsigned> getVScaleForTuning() const override;
diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
index 15bd346..c946451 100644
--- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
@@ -114,14 +114,6 @@ FunctionPass *llvm::createRISCVVLOptimizerPass() {
return new RISCVVLOptimizer();
}
-/// Return true if R is a physical or virtual vector register, false otherwise.
-static bool isVectorRegClass(Register R, const MachineRegisterInfo *MRI) {
- if (R.isPhysical())
- return RISCV::VRRegClass.contains(R);
- const TargetRegisterClass *RC = MRI->getRegClass(R);
- return RISCVRI::isVRegClass(RC->TSFlags);
-}
-
LLVM_ATTRIBUTE_UNUSED
static raw_ostream &operator<<(raw_ostream &OS, const OperandInfo &OI) {
OI.print(OS);
@@ -183,37 +175,28 @@ static unsigned getIntegerExtensionOperandEEW(unsigned Factor,
return Log2EEW;
}
-/// Check whether MO is a mask operand of MI.
-static bool isMaskOperand(const MachineInstr &MI, const MachineOperand &MO,
- const MachineRegisterInfo *MRI) {
-
- if (!MO.isReg() || !isVectorRegClass(MO.getReg(), MRI))
- return false;
-
- const MCInstrDesc &Desc = MI.getDesc();
- return Desc.operands()[MO.getOperandNo()].RegClass == RISCV::VMV0RegClassID;
-}
-
static std::optional<unsigned>
getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) {
const MachineInstr &MI = *MO.getParent();
+ const MCInstrDesc &Desc = MI.getDesc();
const RISCVVPseudosTable::PseudoInfo *RVV =
RISCVVPseudosTable::getPseudoInfo(MI.getOpcode());
assert(RVV && "Could not find MI in PseudoTable");
// MI has a SEW associated with it. The RVV specification defines
// the EEW of each operand and definition in relation to MI.SEW.
- unsigned MILog2SEW =
- MI.getOperand(RISCVII::getSEWOpNum(MI.getDesc())).getImm();
+ unsigned MILog2SEW = MI.getOperand(RISCVII::getSEWOpNum(Desc)).getImm();
- const bool HasPassthru = RISCVII::isFirstDefTiedToFirstUse(MI.getDesc());
- const bool IsTied = RISCVII::isTiedPseudo(MI.getDesc().TSFlags);
+ const bool HasPassthru = RISCVII::isFirstDefTiedToFirstUse(Desc);
+ const bool IsTied = RISCVII::isTiedPseudo(Desc.TSFlags);
bool IsMODef = MO.getOperandNo() == 0 ||
(HasPassthru && MO.getOperandNo() == MI.getNumExplicitDefs());
// All mask operands have EEW=1
- if (isMaskOperand(MI, MO, MRI))
+ const MCOperandInfo &Info = Desc.operands()[MO.getOperandNo()];
+ if (Info.OperandType == MCOI::OPERAND_REGISTER &&
+ Info.RegClass == RISCV::VMV0RegClassID)
return 0;
// switch against BaseInstr to reduce number of cases that need to be
@@ -1296,8 +1279,8 @@ bool RISCVVLOptimizer::isCandidate(const MachineInstr &MI) const {
TII->get(RISCV::getRVVMCOpcode(MI.getOpcode())).TSFlags) &&
"Instruction shouldn't be supported if elements depend on VL");
- assert(MI.getOperand(0).isReg() &&
- isVectorRegClass(MI.getOperand(0).getReg(), MRI) &&
+ assert(RISCVRI::isVRegClass(
+ MRI->getRegClass(MI.getOperand(0).getReg())->TSFlags) &&
"All supported instructions produce a vector register result");
LLVM_DEBUG(dbgs() << "Found a candidate for VL reduction: " << MI << "\n");
@@ -1486,7 +1469,6 @@ bool RISCVVLOptimizer::tryReduceVL(MachineInstr &MI) const {
}
bool RISCVVLOptimizer::runOnMachineFunction(MachineFunction &MF) {
- assert(DemandedVLs.size() == 0);
if (skipFunction(MF.getFunction()))
return false;
@@ -1499,6 +1481,8 @@ bool RISCVVLOptimizer::runOnMachineFunction(MachineFunction &MF) {
TII = ST.getInstrInfo();
+ assert(DemandedVLs.empty());
+
// For each instruction that defines a vector, compute what VL its
// downstream users demand.
for (MachineBasicBlock *MBB : post_order(&MF)) {
diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
index 84ef539..c1cc19b 100644
--- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
@@ -434,6 +434,15 @@ bool RISCVVectorPeephole::convertSameMaskVMergeToVMv(MachineInstr &MI) {
if (!isKnownSameDefs(TrueMask.getReg(), MIMask.getReg()))
return false;
+ // Masked off lanes past TrueVL will come from False, and converting to vmv
+ // will lose these lanes unless MIVL <= TrueVL.
+ // TODO: We could relax this for False == Passthru and True policy == TU
+ const MachineOperand &MIVL = MI.getOperand(RISCVII::getVLOpNum(MI.getDesc()));
+ const MachineOperand &TrueVL =
+ True->getOperand(RISCVII::getVLOpNum(True->getDesc()));
+ if (!RISCV::isVLKnownLE(MIVL, TrueVL))
+ return false;
+
// True's passthru needs to be equivalent to False
Register TruePassthruReg = True->getOperand(1).getReg();
Register FalseReg = MI.getOperand(2).getReg();
diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
index b90e1aa..3c631ce 100644
--- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
@@ -665,10 +665,10 @@ Type *SPIRVEmitIntrinsics::deduceElementTypeHelper(
auto *HandleType = cast<TargetExtType>(II->getOperand(0)->getType());
if (HandleType->getTargetExtName() == "spirv.Image" ||
HandleType->getTargetExtName() == "spirv.SignedImage") {
- if (II->hasOneUse()) {
- auto *U = *II->users().begin();
+ for (User *U : II->users()) {
Ty = cast<Instruction>(U)->getAccessType();
- assert(Ty && "Unable to get type for resource pointer.");
+ if (Ty)
+ break;
}
} else if (HandleType->getTargetExtName() == "spirv.VulkanBuffer") {
// This call is supposed to index into an array
diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
index 6766bd8..595424b 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
@@ -410,6 +410,7 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) {
II, Intrinsic::SPVIntrinsics::spv_lifetime_start, {1});
} else {
II->eraseFromParent();
+ Changed = true;
}
break;
case Intrinsic::lifetime_end:
@@ -418,6 +419,7 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) {
II, Intrinsic::SPVIntrinsics::spv_lifetime_end, {1});
} else {
II->eraseFromParent();
+ Changed = true;
}
break;
case Intrinsic::ptr_annotation:
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
index 4a9c88b..a95c4ff 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
@@ -7,7 +7,6 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/SparcFixupKinds.h"
-#include "MCTargetDesc/SparcMCAsmInfo.h"
#include "MCTargetDesc/SparcMCTargetDesc.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCELFObjectWriter.h"
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
index 1ee6e80..79da53e 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
@@ -13,10 +13,7 @@
#include "MCTargetDesc/SparcMCAsmInfo.h"
#include "llvm/BinaryFormat/ELF.h"
-#include "llvm/MC/MCAssembler.h"
-#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCObjectStreamer.h"
-#include "llvm/MC/MCValue.h"
using namespace llvm;
diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
index 9b434d8..1aa8efe 100644
--- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
@@ -2201,7 +2201,7 @@ SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op,
SDValue Chain = DAG.getEntryNode();
SDValue InGlue;
- Chain = DAG.getCALLSEQ_START(Chain, 1, 0, DL);
+ Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
Chain = DAG.getCopyToReg(Chain, DL, SP::O0, Argument, InGlue);
InGlue = Chain.getValue(1);
SDValue Callee = DAG.getTargetExternalSymbol("__tls_get_addr", PtrVT);
@@ -2219,7 +2219,7 @@ SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op,
InGlue};
Chain = DAG.getNode(SPISD::TLS_CALL, DL, NodeTys, Ops);
InGlue = Chain.getValue(1);
- Chain = DAG.getCALLSEQ_END(Chain, 1, 0, InGlue, DL);
+ Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL);
InGlue = Chain.getValue(1);
SDValue Ret = DAG.getCopyFromReg(Chain, DL, SP::O0, PtrVT, InGlue);
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
index 2662241e..e6486e2 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
@@ -256,9 +256,17 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
// Precompute the set of registers that are unused, so that we can insert
// drops to their defs.
+ // And unstackify any stackified registers that don't have any uses, so that
+ // they can be dropped later. This can happen when transformations after
+ // RegStackify remove instructions using stackified registers.
BitVector UseEmpty(MRI.getNumVirtRegs());
- for (unsigned I = 0, E = MRI.getNumVirtRegs(); I < E; ++I)
- UseEmpty[I] = MRI.use_empty(Register::index2VirtReg(I));
+ for (unsigned I = 0, E = MRI.getNumVirtRegs(); I < E; ++I) {
+ Register Reg = Register::index2VirtReg(I);
+ if (MRI.use_empty(Reg)) {
+ UseEmpty[I] = true;
+ MFI.unstackifyVReg(Reg);
+ }
+ }
// Visit each instruction in the function.
for (MachineBasicBlock &MBB : MF) {
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
index ac819cf..b03b350 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
@@ -15,12 +15,14 @@
#include "WebAssembly.h"
#include "WebAssemblyISelLowering.h"
#include "WebAssemblyTargetMachine.h"
+#include "WebAssemblyUtilities.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/WasmEHFuncInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h" // To access function attributes.
#include "llvm/IR/IntrinsicsWebAssembly.h"
+#include "llvm/MC/MCSymbolWasm.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/raw_ostream.h"
@@ -118,6 +120,51 @@ static SDValue getTagSymNode(int Tag, SelectionDAG *DAG) {
return DAG->getTargetExternalSymbol(SymName, PtrVT);
}
+static APInt encodeFunctionSignature(SelectionDAG *DAG, SDLoc &DL,
+ SmallVector<MVT, 4> &Returns,
+ SmallVector<MVT, 4> &Params) {
+ auto toWasmValType = [](MVT VT) {
+ if (VT == MVT::i32) {
+ return wasm::ValType::I32;
+ }
+ if (VT == MVT::i64) {
+ return wasm::ValType::I64;
+ }
+ if (VT == MVT::f32) {
+ return wasm::ValType::F32;
+ }
+ if (VT == MVT::f64) {
+ return wasm::ValType::F64;
+ }
+ LLVM_DEBUG(errs() << "Unhandled type for llvm.wasm.ref.test.func: " << VT
+ << "\n");
+ llvm_unreachable("Unhandled type for llvm.wasm.ref.test.func");
+ };
+ auto NParams = Params.size();
+ auto NReturns = Returns.size();
+ auto BitWidth = (NParams + NReturns + 2) * 64;
+ auto Sig = APInt(BitWidth, 0);
+
+ // Annoying special case: if getSignificantBits() <= 64 then InstrEmitter will
+ // emit an Imm instead of a CImm. It simplifies WebAssemblyMCInstLower if we
+  // always emit a CImm. So xor NReturns with 0x7ffffff to ensure
+ // getSignificantBits() > 64
+ Sig |= NReturns ^ 0x7ffffff;
+ for (auto &Return : Returns) {
+ auto V = toWasmValType(Return);
+ Sig <<= 64;
+ Sig |= (int64_t)V;
+ }
+ Sig <<= 64;
+ Sig |= NParams;
+ for (auto &Param : Params) {
+ auto V = toWasmValType(Param);
+ Sig <<= 64;
+ Sig |= (int64_t)V;
+ }
+ return Sig;
+}
+
void WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
// If we have a custom node, we already have selected!
if (Node->isMachineOpcode()) {
@@ -189,6 +236,58 @@ void WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
ReplaceNode(Node, TLSAlign);
return;
}
+ case Intrinsic::wasm_ref_test_func: {
+ // First emit the TABLE_GET instruction to convert function pointer ==>
+ // funcref
+ MachineFunction &MF = CurDAG->getMachineFunction();
+ auto PtrVT = MVT::getIntegerVT(MF.getDataLayout().getPointerSizeInBits());
+ MCSymbol *Table = WebAssembly::getOrCreateFunctionTableSymbol(
+ MF.getContext(), Subtarget);
+ SDValue TableSym = CurDAG->getMCSymbol(Table, PtrVT);
+ SDValue FuncPtr = Node->getOperand(1);
+ if (Subtarget->hasAddr64() && FuncPtr.getValueType() == MVT::i64) {
+ // table.get expects an i32 but on 64 bit platforms the function pointer
+ // is an i64. In that case, i32.wrap_i64 to convert.
+ FuncPtr = SDValue(CurDAG->getMachineNode(WebAssembly::I32_WRAP_I64, DL,
+ MVT::i32, FuncPtr),
+ 0);
+ }
+ SDValue FuncRef =
+ SDValue(CurDAG->getMachineNode(WebAssembly::TABLE_GET_FUNCREF, DL,
+ MVT::funcref, TableSym, FuncPtr),
+ 0);
+
+ // Encode the signature information into the type index placeholder.
+ // This gets decoded and converted into the actual type signature in
+ // WebAssemblyMCInstLower.cpp.
+ SmallVector<MVT, 4> Params;
+ SmallVector<MVT, 4> Returns;
+
+ bool IsParam = false;
+ // Operand 0 is the return register, Operand 1 is the function pointer.
+ // The remaining operands encode the type of the function we are testing
+ // for.
+ for (unsigned I = 2, E = Node->getNumOperands(); I < E; ++I) {
+ MVT VT = Node->getOperand(I).getValueType().getSimpleVT();
+ if (VT == MVT::Untyped) {
+ IsParam = true;
+ continue;
+ }
+ if (IsParam) {
+ Params.push_back(VT);
+ } else {
+ Returns.push_back(VT);
+ }
+ }
+ auto Sig = encodeFunctionSignature(CurDAG, DL, Returns, Params);
+
+ auto SigOp = CurDAG->getTargetConstant(
+ Sig, DL, EVT::getIntegerVT(*CurDAG->getContext(), Sig.getBitWidth()));
+ MachineSDNode *RefTestNode = CurDAG->getMachineNode(
+ WebAssembly::REF_TEST_FUNCREF, DL, MVT::i32, {SigOp, FuncRef});
+ ReplaceNode(Node, RefTestNode);
+ return;
+ }
}
break;
}
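A reading aid for the two new WebAssembly hunks above: wasm.ref.test.func smuggles the tested function type through instruction selection by packing it into one wide APInt (forced to be a CImm via the 0x7ffffff XOR on the top word), and WebAssemblyMCInstLower later unpacks it into a real type index. A stand-alone sketch of just the packing/unpacking order, using plain 64-bit words instead of APInt; the most significant word comes first, and the ValType values merely mirror the wasm encoding:

#include <cassert>
#include <cstdint>
#include <vector>

enum class ValType : uint64_t { I32 = 0x7F, I64 = 0x7E, F32 = 0x7D, F64 = 0x7C };

// Layout, most significant word first, as built by encodeFunctionSignature():
//   [NReturns ^ 0x7ffffff] [returns...] [NParams] [params...]
static std::vector<uint64_t> encode(const std::vector<ValType> &Returns,
                                    const std::vector<ValType> &Params) {
  std::vector<uint64_t> Words;
  Words.push_back(Returns.size() ^ 0x7ffffff);
  for (ValType R : Returns)
    Words.push_back(static_cast<uint64_t>(R));
  Words.push_back(Params.size());
  for (ValType P : Params)
    Words.push_back(static_cast<uint64_t>(P));
  return Words;
}

// Consume from the top word, as lowerEncodedFunctionSignature() does.
static void decode(const std::vector<uint64_t> &Words,
                   std::vector<ValType> &Returns,
                   std::vector<ValType> &Params) {
  size_t Idx = 0;
  uint64_t NReturns = Words[Idx++] ^ 0x7ffffff;
  for (uint64_t I = 0; I < NReturns; ++I)
    Returns.push_back(static_cast<ValType>(Words[Idx++]));
  uint64_t NParams = Words[Idx++];
  for (uint64_t I = 0; I < NParams; ++I)
    Params.push_back(static_cast<ValType>(Words[Idx++]));
}

int main() {
  // A function of type (i32, f64) -> i64: one return, two params.
  std::vector<uint64_t> Words =
      encode({ValType::I64}, {ValType::I32, ValType::F64});
  std::vector<ValType> R, P;
  decode(Words, R, P);
  assert(R.size() == 1 && R[0] == ValType::I64);
  assert(P.size() == 2 && P[0] == ValType::I32 && P[1] == ValType::F64);
  return 0;
}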
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 09b8864..11936a3 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -798,6 +798,7 @@ LowerCallResults(MachineInstr &CallResults, DebugLoc DL, MachineBasicBlock *BB,
if (IsIndirect) {
// Placeholder for the type index.
+ // This gets replaced with the correct value in WebAssemblyMCInstLower.cpp
MIB.addImm(0);
// The table into which this call_indirect indexes.
MCSymbolWasm *Table = IsFuncrefCall
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
index cc36244..4613fcb 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
@@ -15,13 +15,18 @@
#include "WebAssemblyMCInstLower.h"
#include "MCTargetDesc/WebAssemblyMCAsmInfo.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "MCTargetDesc/WebAssemblyMCTypeUtilities.h"
#include "TargetInfo/WebAssemblyTargetInfo.h"
#include "Utils/WebAssemblyTypeUtilities.h"
#include "WebAssemblyAsmPrinter.h"
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblyUtilities.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/BinaryFormat/Wasm.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/IR/Constants.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
@@ -152,6 +157,34 @@ MCOperand WebAssemblyMCInstLower::lowerTypeIndexOperand(
return MCOperand::createExpr(Expr);
}
+MCOperand
+WebAssemblyMCInstLower::lowerEncodedFunctionSignature(const APInt &Sig) const {
+ // For APInt, a word is 64 bits on all architectures; see the definition in APInt.h.
+ auto NumWords = Sig.getNumWords();
+ SmallVector<wasm::ValType, 4> Params;
+ SmallVector<wasm::ValType, 2> Returns;
+
+ int Idx = NumWords;
+ auto GetWord = [&Idx, &Sig]() {
+ Idx--;
+ return Sig.extractBitsAsZExtValue(64, 64 * Idx);
+ };
+ // Annoying special case: if getSignificantBits() <= 64 then InstrEmitter
+ // will emit an Imm instead of a CImm. It simplifies WebAssemblyMCInstLower
+ // if we always emit a CImm, so NReturns was xored with 0x7ffffff on the
+ // encode side to ensure getSignificantBits() > 64.
+ // See encodeFunctionSignature in WebAssemblyISelDAGToDAG.cpp.
+ int NReturns = GetWord() ^ 0x7ffffff;
+ for (int I = 0; I < NReturns; I++) {
+ Returns.push_back(static_cast<wasm::ValType>(GetWord()));
+ }
+ int NParams = GetWord();
+ for (int I = 0; I < NParams; I++) {
+ Params.push_back(static_cast<wasm::ValType>(GetWord()));
+ }
+ return lowerTypeIndexOperand(std::move(Returns), std::move(Params));
+}
+
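As a sanity check on the two halves of this scheme, the round trip can be exercised in isolation. The following is a minimal sketch (not part of the patch; main() and the asserted constants are illustrative only), assuming the hypothetical (i32) -> i64 signature:

  #include "llvm/ADT/APInt.h"
  #include <cassert>
  #include <cstdint>

  int main() {
    // Encode (i32) -> i64 the same way encodeFunctionSignature does.
    llvm::APInt Sig((1 + 1 + 2) * 64, 0);
    Sig |= uint64_t(1) ^ 0x7ffffff;    // NReturns = 1, xored
    Sig <<= 64; Sig |= uint64_t(0x7E); // i64 return type
    Sig <<= 64; Sig |= uint64_t(1);    // NParams = 1
    Sig <<= 64; Sig |= uint64_t(0x7F); // i32 param type

    // Decode by walking 64-bit words from the most significant one down,
    // mirroring lowerEncodedFunctionSignature.
    unsigned Idx = Sig.getNumWords();
    auto GetWord = [&]() { return Sig.extractBitsAsZExtValue(64, 64 * --Idx); };
    assert((GetWord() ^ 0x7ffffff) == 1); // NReturns
    assert(GetWord() == 0x7E);            // i64 return
    assert(GetWord() == 1);               // NParams
    assert(GetWord() == 0x7F);            // i32 param
    return 0;
  }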
static void getFunctionReturns(const MachineInstr *MI,
SmallVectorImpl<wasm::ValType> &Returns) {
const Function &F = MI->getMF()->getFunction();
@@ -196,11 +229,30 @@ void WebAssemblyMCInstLower::lower(const MachineInstr *MI,
MCOp = MCOperand::createReg(WAReg);
break;
}
+ case llvm::MachineOperand::MO_CImmediate: {
+ // Lower the type index placeholder for ref.test.
+ // Currently this is the only way that CImmediates show up, so assert if
+ // anything else appears.
+ unsigned DescIndex = I - NumVariadicDefs;
+ assert(DescIndex < Desc.NumOperands && "unexpected CImmediate operand");
+ auto Operands = Desc.operands();
+ const MCOperandInfo &Info = Operands[DescIndex];
+ assert(Info.OperandType == WebAssembly::OPERAND_TYPEINDEX &&
+ "unexpected CImmediate operand");
+ (void)Info;
+ MCOp = lowerEncodedFunctionSignature(MO.getCImm()->getValue());
+ break;
+ }
case MachineOperand::MO_Immediate: {
unsigned DescIndex = I - NumVariadicDefs;
if (DescIndex < Desc.NumOperands) {
- const MCOperandInfo &Info = Desc.operands()[DescIndex];
+ auto Operands = Desc.operands();
+ const MCOperandInfo &Info = Operands[DescIndex];
+ // Replace type index placeholder with actual type index. The type index
+ // placeholders are Immediates and have an operand type of
+ // OPERAND_TYPEINDEX or OPERAND_SIGNATURE.
if (Info.OperandType == WebAssembly::OPERAND_TYPEINDEX) {
+ // Lower type index placeholder for a CALL_INDIRECT instruction
SmallVector<wasm::ValType, 4> Returns;
SmallVector<wasm::ValType, 4> Params;
@@ -228,6 +280,7 @@ void WebAssemblyMCInstLower::lower(const MachineInstr *MI,
break;
}
if (Info.OperandType == WebAssembly::OPERAND_SIGNATURE) {
+ // Lower type index placeholder for blocks
auto BT = static_cast<WebAssembly::BlockType>(MO.getImm());
assert(BT != WebAssembly::BlockType::Invalid);
if (BT == WebAssembly::BlockType::Multivalue) {
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h
index 9f08499..34404d9 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h
@@ -36,6 +36,7 @@ class LLVM_LIBRARY_VISIBILITY WebAssemblyMCInstLower {
MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
MCOperand lowerTypeIndexOperand(SmallVectorImpl<wasm::ValType> &&,
SmallVectorImpl<wasm::ValType> &&) const;
+ MCOperand lowerEncodedFunctionSignature(const APInt &Sig) const;
public:
WebAssemblyMCInstLower(MCContext &ctx, WebAssemblyAsmPrinter &printer)
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index b642c1c..8213e51 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -1042,8 +1042,8 @@ private:
}
PrevState = CurrState;
}
- void onRParen() {
- PrevState = State;
+ bool onRParen(StringRef &ErrMsg) {
+ IntelExprState CurrState = State;
switch (State) {
default:
State = IES_ERROR;
@@ -1054,9 +1054,27 @@ private:
case IES_RBRAC:
case IES_RPAREN:
State = IES_RPAREN;
+ // In the case of a multiply, onRegister has already set IndexReg
+ // directly, with the appropriate scale.
+ // Otherwise, if we just saw a register, it has only been stored in
+ // TmpReg, so we need to commit it to the state machine here.
+ if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) {
+ // If we do not have a BaseReg yet, this register becomes the BaseReg;
+ // otherwise treat it as the IndexReg with no explicit scale.
+ if (!BaseReg) {
+ BaseReg = TmpReg;
+ } else {
+ if (IndexReg)
+ return regsUseUpError(ErrMsg);
+ IndexReg = TmpReg;
+ Scale = 0;
+ }
+ }
IC.pushOperator(IC_RPAREN);
break;
}
+ PrevState = CurrState;
+ return false;
}
bool onOffset(const MCExpr *Val, SMLoc OffsetLoc, StringRef ID,
const InlineAsmIdentifierInfo &IDInfo,
@@ -2172,7 +2190,11 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
}
break;
case AsmToken::LParen: SM.onLParen(); break;
- case AsmToken::RParen: SM.onRParen(); break;
+ case AsmToken::RParen:
+ if (SM.onRParen(ErrMsg)) {
+ return Error(Tok.getLoc(), ErrMsg);
+ }
+ break;
}
if (SM.hadError())
return Error(Tok.getLoc(), "unknown token in expression");
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
index f5eeb3b..d691538 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
@@ -11,7 +11,6 @@
//===----------------------------------------------------------------------===//
#include "X86MCAsmInfo.h"
-#include "MCTargetDesc/X86MCExpr.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/Support/CommandLine.h"
diff --git a/llvm/lib/Target/X86/X86AsmPrinter.h b/llvm/lib/Target/X86/X86AsmPrinter.h
index efb951b..e02b556 100644
--- a/llvm/lib/Target/X86/X86AsmPrinter.h
+++ b/llvm/lib/Target/X86/X86AsmPrinter.h
@@ -151,6 +151,7 @@ private:
MCSymbol *LazyPointer) override;
void emitCallInstruction(const llvm::MCInst &MCI);
+ void maybeEmitNopAfterCallForWindowsEH(const MachineInstr *MI);
// Emits a label to mark the next instruction as being relevant to Import Call
// Optimization.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6281124..11ab8dc 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5001,9 +5001,12 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
EVT VT = Op.getValueType();
unsigned SizeInBits = VT.getSizeInBits();
- assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
unsigned NumElts = SizeInBits / EltSizeInBits;
+ // Can't split constant.
+ if ((SizeInBits % EltSizeInBits) != 0)
+ return false;
+
// Bitcast a source array of element bits to the target size.
auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
unsigned NumSrcElts = UndefSrcElts.getBitWidth();
@@ -45059,6 +45062,10 @@ bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
unsigned NumElts = DemandedElts.getBitWidth();
switch (Op.getOpcode()) {
+ case X86ISD::GlobalBaseReg:
+ case X86ISD::Wrapper:
+ case X86ISD::WrapperRIP:
+ return true;
case X86ISD::BLENDI:
case X86ISD::PSHUFD:
case X86ISD::UNPCKL:
@@ -45098,27 +45105,34 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
switch (Op.getOpcode()) {
+ // SSE vector insert/extracts use modulo indices.
+ case X86ISD::PINSRB:
+ case X86ISD::PINSRW:
+ case X86ISD::PEXTRB:
+ case X86ISD::PEXTRW:
+ return false;
// SSE vector multiplies are either inbounds or saturate.
case X86ISD::VPMADDUBSW:
case X86ISD::VPMADDWD:
+ return false;
// SSE vector shifts handle out of bounds shift amounts.
case X86ISD::VSHLI:
case X86ISD::VSRLI:
case X86ISD::VSRAI:
return false;
- // SSE blends.
+ // SSE blends.
case X86ISD::BLENDI:
case X86ISD::BLENDV:
return false;
- // SSE target shuffles.
+ // SSE target shuffles.
case X86ISD::PSHUFD:
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
case X86ISD::VPERMILPI:
case X86ISD::VPERMV3:
return false;
- // SSE comparisons handle all icmp/fcmp cases.
- // TODO: Add CMPM/MM with test coverage.
+ // SSE comparisons handle all icmp/fcmp cases.
+ // TODO: Add CMPM/MM with test coverage.
case X86ISD::CMPP:
case X86ISD::PCMPEQ:
case X86ISD::PCMPGT:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 2636979..547b221 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1668,7 +1668,8 @@ namespace llvm {
/// Lower interleaved store(s) into target specific
/// instructions/intrinsics.
- bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+ bool lowerInterleavedStore(Instruction *Store, Value *Mask,
+ ShuffleVectorInst *SVI,
unsigned Factor) const override;
SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr,
diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
index 360293bc..636b072 100644
--- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -822,7 +822,8 @@ bool X86TargetLowering::lowerInterleavedLoad(
return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
}
-bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
+bool X86TargetLowering::lowerInterleavedStore(Instruction *Store,
+ Value *LaneMask,
ShuffleVectorInst *SVI,
unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
@@ -832,6 +833,11 @@ bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
0 &&
"Invalid interleaved store");
+ auto *SI = dyn_cast<StoreInst>(Store);
+ if (!SI)
+ return false;
+ assert(!LaneMask && "Unexpected mask on store");
+
// Holds the indices of SVI that correspond to the starting index of each
// interleaved shuffle.
auto Mask = SVI->getShuffleMask();
diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp
index 45d596b..481a9be 100644
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -32,6 +32,7 @@
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Mangler.h"
@@ -833,6 +834,7 @@ void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI,
CallInst.setOpcode(CallOpcode);
CallInst.addOperand(CallTargetMCOp);
OutStreamer->emitInstruction(CallInst, getSubtargetInfo());
+ maybeEmitNopAfterCallForWindowsEH(&MI);
}
// Record our statepoint node in the same section used by STACKMAP
@@ -1430,21 +1432,6 @@ void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI,
OutStreamer->emitLabel(FallthroughLabel);
}
-// Returns instruction preceding MBBI in MachineFunction.
-// If MBBI is the first instruction of the first basic block, returns null.
-static MachineBasicBlock::const_iterator
-PrevCrossBBInst(MachineBasicBlock::const_iterator MBBI) {
- const MachineBasicBlock *MBB = MBBI->getParent();
- while (MBBI == MBB->begin()) {
- if (MBB == &MBB->getParent()->front())
- return MachineBasicBlock::const_iterator();
- MBB = MBB->getPrevNode();
- MBBI = MBB->end();
- }
- --MBBI;
- return MBBI;
-}
-
static unsigned getSrcIdx(const MachineInstr* MI, unsigned SrcIdx) {
if (X86II::isKMasked(MI->getDesc().TSFlags)) {
// Skip mask operand.
@@ -2271,6 +2258,9 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
OutStreamer->AddComment("EVEX TO EVEX Compression ", false);
}
+ // We use this to suppress NOP padding for Windows EH.
+ bool IsTailJump = false;
+
switch (MI->getOpcode()) {
case TargetOpcode::DBG_VALUE:
llvm_unreachable("Should be handled target independently");
@@ -2325,6 +2315,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
// Lower this as normal, but add a comment.
OutStreamer->AddComment("TAILCALL");
+ IsTailJump = true;
break;
case X86::TAILJMPr:
@@ -2340,6 +2331,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
// Lower these as normal, but add some comments.
OutStreamer->AddComment("TAILCALL");
+ IsTailJump = true;
break;
case X86::TAILJMPm64_REX:
@@ -2349,6 +2341,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
}
OutStreamer->AddComment("TAILCALL");
+ IsTailJump = true;
break;
case X86::TAILJMPr64_REX: {
@@ -2361,6 +2354,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
}
OutStreamer->AddComment("TAILCALL");
+ IsTailJump = true;
break;
}
@@ -2537,26 +2531,6 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
case X86::SEH_BeginEpilogue: {
assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
- // Windows unwinder will not invoke function's exception handler if IP is
- // either in prologue or in epilogue. This behavior causes a problem when a
- // call immediately precedes an epilogue, because the return address points
- // into the epilogue. To cope with that, we insert a 'nop' if it ends up
- // immediately after a CALL in the final emitted code.
- MachineBasicBlock::const_iterator MBBI(MI);
- // Check if preceded by a call and emit nop if so.
- for (MBBI = PrevCrossBBInst(MBBI);
- MBBI != MachineBasicBlock::const_iterator();
- MBBI = PrevCrossBBInst(MBBI)) {
- // Pseudo instructions that aren't a call are assumed to not emit any
- // code. If they do, we worst case generate unnecessary noops after a
- // call.
- if (MBBI->isCall() || !MBBI->isPseudo()) {
- if (MBBI->isCall())
- EmitAndCountInstruction(MCInstBuilder(X86::NOOP));
- break;
- }
- }
-
EmitSEHInstruction(MI);
return;
}
@@ -2585,6 +2559,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX));
emitCallInstruction(TmpInst);
emitNop(*OutStreamer, 5, Subtarget);
+ maybeEmitNopAfterCallForWindowsEH(MI);
return;
}
@@ -2605,6 +2580,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
// For Import Call Optimization to work, we need a 3-byte nop after the
// call instruction.
emitNop(*OutStreamer, 3, Subtarget);
+ maybeEmitNopAfterCallForWindowsEH(MI);
return;
}
break;
@@ -2638,6 +2614,10 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
if (MI->isCall()) {
emitCallInstruction(TmpInst);
+ // Since tail calls transfer control without leaving a stack frame, there
+ // is never a need for NOP padding after tail calls.
+ if (!IsTailJump)
+ maybeEmitNopAfterCallForWindowsEH(MI);
return;
}
@@ -2659,6 +2639,164 @@ void X86AsmPrinter::emitCallInstruction(const llvm::MCInst &MCI) {
OutStreamer->emitInstruction(MCI, getSubtargetInfo());
}
+// Determines whether a NOP is required after a CALL, so that Windows EH
+// IP2State tables have the correct information.
+//
+// On most Windows platforms (AMD64, ARM64, ARM32, IA64, but *not* x86-32),
+// exception handling works by looking up instruction pointers in lookup
+// tables. These lookup tables are stored in .xdata sections in executables.
+// One element of the lookup tables is the "IP2State" table (Instruction
+// Pointer to State).
+//
+// If a function has any instructions that require cleanup during exception
+// unwinding, then it will have an IP2State table. Each entry in the IP2State
+// table describes a range of bytes in the function's instruction stream, and
+// associates an "EH state number" with that range of instructions. A value of
+// -1 means "the null state", which does not require any code to execute.
+// A value other than -1 is an index into the State table.
+//
+// The entries in the IP2State table contain byte offsets within the instruction
+// stream of the function. The Windows ABI requires that these offsets are
+// aligned to instruction boundaries; they are not permitted to point to a byte
+// that is not the first byte of an instruction.
+//
+// Unfortunately, CALL instructions present a problem during unwinding. CALL
+// instructions push the address of the instruction after the CALL instruction,
+// so that execution can resume after the CALL. If the CALL is the last
+// instruction within an IP2State region, then the return address (on the stack)
+// points to the *next* IP2State region. This means that the unwinder will
+// use the wrong cleanup funclet during unwinding.
+//
+// To fix this problem, the Windows AMD64 ABI requires that CALL instructions
+// are never placed at the end of an IP2State region. Stated equivalently, the
+// end of a CALL instruction cannot be aligned to an IP2State boundary. If a
+// CALL instruction would occur at the end of an IP2State region, then the
+// compiler must insert a NOP instruction after the CALL. The NOP instruction
+// is placed in the same EH region as the CALL instruction, so that the return
+// address points to the NOP and the unwinder will locate the correct region.
+//
+// NOP padding is only necessary on Windows AMD64 targets. On ARM64 and ARM32,
+// instructions have a fixed size so the unwinder knows how to "back up" by
+// one instruction.
+//
+// Interaction with Import Call Optimization (ICO):
+//
+// Import Call Optimization (ICO) is a compiler + OS feature on Windows which
+// improves the performance and security of DLL imports. ICO relies on using a
+// specific CALL idiom that can be replaced by the OS DLL loader. This removes
+// a load and indirect CALL and replaces it with a single direct CALL.
+//
+// To achieve this, ICO also inserts NOPs after the CALL instruction. If the
+// end of the CALL is aligned with an EH state transition, we *also* insert
+// a single-byte NOP. **Both forms of NOPs must be preserved.** They cannot
+// be combined into a single larger NOP; nor can the second NOP be removed.
+//
+// This is necessary because, if ICO is active and the call site is modified
+// by the loader, the loader will end up overwriting the NOPs that were inserted
+// for ICO. That means that those NOPs cannot be used for the correct
+// termination of the exception handling region (the IP2State transition),
+// so we still need an additional NOP instruction. The NOPs cannot be combined
+// into a longer NOP (which is ordinarily desirable) because then ICO would
+// split one instruction, producing a malformed instruction after the ICO call.
+void X86AsmPrinter::maybeEmitNopAfterCallForWindowsEH(const MachineInstr *MI) {
+ // We only need to insert NOPs after CALLs when targeting Windows on AMD64.
+ // (Don't let the name fool you: Itanium refers to table-based exception
+ // handling, not the Itanium architecture.)
+ if (MAI->getExceptionHandlingType() != ExceptionHandling::WinEH ||
+ MAI->getWinEHEncodingType() != WinEH::EncodingType::Itanium) {
+ return;
+ }
+
+ bool HasEHPersonality = MF->getWinEHFuncInfo() != nullptr;
+
+ // Set up MBB iterator, initially positioned on the same MBB as MI.
+ MachineFunction::const_iterator MFI(MI->getParent());
+ MachineFunction::const_iterator MFE(MF->end());
+
+ // Set up instruction iterator, positioned immediately *after* MI.
+ MachineBasicBlock::const_iterator MBBI(MI);
+ MachineBasicBlock::const_iterator MBBE = MI->getParent()->end();
+ ++MBBI; // Step over MI
+
+ // This loop iterates MBBs
+ for (;;) {
+ // This loop iterates instructions
+ for (; MBBI != MBBE; ++MBBI) {
+ // Check the instruction that follows this CALL.
+ const MachineInstr &NextMI = *MBBI;
+
+ // If there is an EH_LABEL after this CALL, then there is an EH state
+ // transition after this CALL. This is exactly the situation which
+ // requires NOP padding.
+ if (NextMI.isEHLabel()) {
+ if (HasEHPersonality) {
+ EmitAndCountInstruction(MCInstBuilder(X86::NOOP));
+ return;
+ }
+ // We actually want to continue, in case there is an SEH_BeginEpilogue
+ // instruction after the EH_LABEL. In some situations, IR is produced
+ // that contains EH_LABEL pseudo-instructions, even when we are not
+ // generating IP2State tables. We still need to insert a NOP before
+ // SEH_BeginEpilogue in that case.
+ continue;
+ }
+
+ // Somewhat similarly, if the CALL is the last instruction before the
+ // SEH epilogue, then we also need a NOP. This is necessary because the
+ // Windows stack unwinder will not invoke a function's exception handler
+ // if the instruction pointer is in the function prologue or epilogue.
+ //
+ // We always emit a NOP before SEH_BeginEpilogue, even if there is no
+ // personality function (unwind info) for this frame. This is the same
+ // behavior as MSVC.
+ if (NextMI.getOpcode() == X86::SEH_BeginEpilogue) {
+ EmitAndCountInstruction(MCInstBuilder(X86::NOOP));
+ return;
+ }
+
+ if (!NextMI.isPseudo() && !NextMI.isMetaInstruction()) {
+ // We found a real instruction. During the CALL, the return IP will
+ // point to this instruction. Since this instruction has the same EH
+ // state as the call itself (because there is no intervening EH_LABEL),
+ // the IP2State table will be accurate; there is no need to insert a
+ // NOP.
+ return;
+ }
+
+ // The next instruction is a pseudo-op. Ignore it and keep searching.
+ // Because these instructions do not generate any machine code, they
+ // cannot prevent the IP2State table from pointing at the wrong
+ // instruction during a CALL.
+ }
+
+ // We've reached the end of this MBB. Find the next MBB in program order.
+ // MBB order should be finalized by this point, so falling across MBBs is
+ // expected.
+ ++MFI;
+ if (MFI == MFE) {
+ // No more blocks; we've reached the end of the function. This should
+ // only happen with no-return functions, but double-check to be sure.
+ if (HasEHPersonality) {
+ // If the CALL's block has no successors, then it is a noreturn call.
+ // Insert an INT3 instead of a NOP. This accomplishes the same purpose,
+ // but is clearer to read. Also, analysis tools will understand
+ // that they should not continue disassembling after the CALL (unless
+ // there are other branches to that label).
+ if (MI->getParent()->succ_empty())
+ EmitAndCountInstruction(MCInstBuilder(X86::INT3));
+ else
+ EmitAndCountInstruction(MCInstBuilder(X86::NOOP));
+ }
+ return;
+ }
+
+ // Set up iterator to scan the next basic block.
+ const MachineBasicBlock *NextMBB = &*MFI;
+ MBBI = NextMBB->instr_begin();
+ MBBE = NextMBB->instr_end();
+ }
+}
+
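To make the scenario concrete, here is a small C++ source sketch (assuming the MSVC-model C++ EH on x64 that the comment above describes) of the kind of code where a CALL would otherwise end an IP2State region; callee and guarded are hypothetical names for illustration:

  // Sketch: if the call to callee() is the last instruction of the try
  // region, the return address pushed by the CALL points past the region,
  // into the handler's IP2State range. The NOP emitted after the CALL keeps
  // the return address inside the try region so the unwinder picks the
  // correct state.
  void callee(); // assumed external function that may throw

  int guarded() {
    int State = 0;
    try {
      callee(); // CALL; without padding its return address could fall
                // outside the try region's IP2State entry
    } catch (...) {
      State = -1; // handler with a different EH state
    }
    return State;
  }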
void X86AsmPrinter::emitLabelAndRecordForImportCallOptimization(
ImportCallKind Kind) {
assert(EnableImportCallOptimization);