Diffstat (limited to 'llvm/lib/Target')
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 8
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 3
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 4
 llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp | 2
 llvm/lib/Target/AMDGPU/DSInstructions.td | 6
 llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 1
 llvm/lib/Target/AMDGPU/FLATInstructions.td | 6
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 180
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 19
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 61
 llvm/lib/Target/AMDGPU/SIInstrInfo.h | 9
 llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 47
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 3
 llvm/lib/Target/ARM/ARMISelLowering.cpp | 29
 llvm/lib/Target/Mips/MipsFastISel.cpp | 7
 llvm/lib/Target/Mips/MipsISelLowering.cpp | 7
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 6
 llvm/lib/Target/RISCV/RISCVFeatures.td | 3
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 5
 llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td | 28
 llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td | 51
 llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp | 20
 llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp | 4
 llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp | 10
 llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp | 32
 llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td | 4
 llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td | 28
 llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp | 2
 llvm/lib/Target/X86/X86.td | 7
 llvm/lib/Target/X86/X86ISelLowering.cpp | 60
 llvm/lib/Target/X86/X86ISelLoweringCall.cpp | 20
 llvm/lib/Target/X86/X86InstrInfo.cpp | 22
 llvm/lib/Target/X86/X86MCInstLower.cpp | 31
 33 files changed, 508 insertions(+), 217 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 91c1f59..662d84b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18176,8 +18176,8 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
- if (Factor != 2 && Factor != 4) {
- LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n");
+ if (Factor != 2 && Factor != 3 && Factor != 4) {
+ LLVM_DEBUG(dbgs() << "Matching ld2, ld3 and ld4 patterns failed\n");
return false;
}
auto *LI = dyn_cast<LoadInst>(Load);
@@ -18255,8 +18255,8 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
Instruction *Store, Value *Mask,
ArrayRef<Value *> InterleavedValues) const {
unsigned Factor = InterleavedValues.size();
- if (Factor != 2 && Factor != 4) {
- LLVM_DEBUG(dbgs() << "Matching st2 and st4 patterns failed\n");
+ if (Factor != 2 && Factor != 3 && Factor != 4) {
+ LLVM_DEBUG(dbgs() << "Matching st2, st3 and st4 patterns failed\n");
return false;
}
StoreInst *SI = dyn_cast<StoreInst>(Store);
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 30dfcf2b..12c600f 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -10600,6 +10600,9 @@ describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg,
Register DestReg = DestSrc->Destination->getReg();
Register SrcReg = DestSrc->Source->getReg();
+ if (!DestReg.isValid() || !SrcReg.isValid())
+ return std::nullopt;
+
auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
// If the described register is the destination, just return the source.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 1b559a6..8ed4062 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -514,8 +514,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
MVT::i64, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
- setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32,
- Legal);
+ setOperationAction({ISD::ABS, ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX},
+ MVT::i32, Legal);
setOperationAction(
{ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF},
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index bfe2c80..a67b12a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -901,6 +901,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
addRulesForGOpcs({G_ABS}, Standard).Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}});
+ addRulesForGOpcs({G_READSTEADYCOUNTER}, Standard).Uni(S64, {{Sgpr64}, {}});
+
bool hasSALUFloat = ST->hasSALUFloatInsts();
addRulesForGOpcs({G_FADD}, Standard)
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index d0ad120..b841171 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -1488,6 +1488,12 @@ let AssemblerPredicate = isGFX12Plus in {
def : MnemonicAlias<"ds_load_tr_b64", "ds_load_tr8_b64">, Requires<[isGFX1250Plus]>;
def : MnemonicAlias<"ds_load_tr_b128", "ds_load_tr16_b128">, Requires<[isGFX1250Plus]>;
+// Additional aliases for ds load transpose instructions.
+def : MnemonicAlias<"ds_load_b64_tr_b8", "ds_load_tr8_b64">, Requires<[isGFX125xOnly]>;
+def : MnemonicAlias<"ds_load_b128_tr_b16", "ds_load_tr16_b128">, Requires<[isGFX125xOnly]>;
+def : MnemonicAlias<"ds_load_b64_tr_b4", "ds_load_tr4_b64">, Requires<[isGFX125xOnly]>;
+def : MnemonicAlias<"ds_load_b96_tr_b6", "ds_load_tr6_b96">, Requires<[isGFX125xOnly]>;
+
//===----------------------------------------------------------------------===//
// GFX11.
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index e0375ea..e3f3aba 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -892,6 +892,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
// have EXEC as implicit destination. Issue a warning if encoding for
// vdst is not EXEC.
if ((MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3) &&
+ MCII->get(MI.getOpcode()).getNumDefs() == 0 &&
MCII->get(MI.getOpcode()).hasImplicitDefOfPhysReg(AMDGPU::EXEC)) {
auto ExecEncoding = MRI.getEncodingValue(AMDGPU::EXEC_LO);
if (Bytes_[0] != ExecEncoding)
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 6de59be..8ea64d1 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -3711,6 +3711,12 @@ defm GLOBAL_LOAD_TR_B64_w32 : VFLAT_Real_AllAddr_gfx1250<0x058, "globa
defm GLOBAL_LOAD_TR4_B64 : VFLAT_Real_AllAddr_gfx1250<0x073>;
defm GLOBAL_LOAD_TR6_B96 : VFLAT_Real_AllAddr_gfx1250<0x074>;
+// Additional aliases for global load transpose instructions.
+def : MnemonicAlias<"global_load_b128_tr_b16", "global_load_tr16_b128">, Requires<[isGFX125xOnly]>;
+def : MnemonicAlias<"global_load_b64_tr_b8", "global_load_tr8_b64">, Requires<[isGFX125xOnly]>;
+def : MnemonicAlias<"global_load_b64_tr_b4", "global_load_tr4_b64">, Requires<[isGFX125xOnly]>;
+def : MnemonicAlias<"global_load_b96_tr_b6", "global_load_tr6_b96">, Requires<[isGFX125xOnly]>;
+
defm FLAT_ATOMIC_ADD_F64 : VFLAT_Real_Atomics_gfx1250<0x055>;
defm FLAT_ATOMIC_MIN_F64 : VFLAT_Real_Atomics_gfx1250<0x05b, "flat_atomic_min_num_f64">;
defm FLAT_ATOMIC_MAX_F64 : VFLAT_Real_Atomics_gfx1250<0x05c, "flat_atomic_max_num_f64">;
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 58482ea..9fbf9e5 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -69,6 +69,12 @@ static cl::opt<bool> GCNTrackers(
cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
cl::init(false));
+static cl::opt<unsigned> PendingQueueLimit(
+ "amdgpu-scheduler-pending-queue-limit", cl::Hidden,
+ cl::desc(
+ "Max (Available+Pending) size to inspect pending queue (0 disables)"),
+ cl::init(256));
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
#define DUMP_MAX_REG_PRESSURE
static cl::opt<bool> PrintMaxRPRegUsageBeforeScheduler(
@@ -335,17 +341,52 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
}
}
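+// Gate for scanning the pending queue: require a buffered machine model and
+// a combined Available+Pending size within the configured limit.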
+static bool shouldCheckPending(SchedBoundary &Zone,
+ const TargetSchedModel *SchedModel) {
+ bool HasBufferedModel =
+ SchedModel->hasInstrSchedModel() && SchedModel->getMicroOpBufferSize();
+ unsigned Combined = Zone.Available.size() + Zone.Pending.size();
+ return Combined <= PendingQueueLimit && HasBufferedModel;
+}
+
+static SUnit *pickOnlyChoice(SchedBoundary &Zone,
+ const TargetSchedModel *SchedModel) {
+ // pickOnlyChoice() releases pending instructions and checks for new hazards.
+ SUnit *OnlyChoice = Zone.pickOnlyChoice();
+ if (!shouldCheckPending(Zone, SchedModel) || Zone.Pending.empty())
+ return OnlyChoice;
+
+ return nullptr;
+}
+
+void GCNSchedStrategy::printCandidateDecision(const SchedCandidate &Current,
+ const SchedCandidate &Preferred) {
+ LLVM_DEBUG({
+ dbgs() << "Prefer:\t\t";
+ DAG->dumpNode(*Preferred.SU);
+
+ if (Current.SU) {
+ dbgs() << "Not:\t";
+ DAG->dumpNode(*Current.SU);
+ }
+
+ dbgs() << "Reason:\t\t";
+ traceCandidate(Preferred);
+ });
+}
+
// This function is mostly cut and pasted from
// GenericScheduler::pickNodeFromQueue()
void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
const CandPolicy &ZonePolicy,
const RegPressureTracker &RPTracker,
- SchedCandidate &Cand,
+ SchedCandidate &Cand, bool &IsPending,
bool IsBottomUp) {
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
unsigned SGPRPressure = 0;
unsigned VGPRPressure = 0;
+ IsPending = false;
if (DAG->isTrackingPressure()) {
if (!GCNTrackers) {
SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
@@ -358,8 +399,9 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
VGPRPressure = T->getPressure().getArchVGPRNum();
}
}
- ReadyQueue &Q = Zone.Available;
- for (SUnit *SU : Q) {
+ LLVM_DEBUG(dbgs() << "Available Q:\n");
+ ReadyQueue &AQ = Zone.Available;
+ for (SUnit *SU : AQ) {
SchedCandidate TryCand(ZonePolicy);
initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
@@ -371,27 +413,55 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
// Initialize resource delta if needed in case future heuristics query it.
if (TryCand.ResDelta == SchedResourceDelta())
TryCand.initResourceDelta(Zone.DAG, SchedModel);
+ LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
Cand.setBest(TryCand);
- LLVM_DEBUG(traceCandidate(Cand));
+ } else {
+ printCandidateDecision(TryCand, Cand);
+ }
+ }
+
+ if (!shouldCheckPending(Zone, SchedModel))
+ return;
+
+ LLVM_DEBUG(dbgs() << "Pending Q:\n");
+ ReadyQueue &PQ = Zone.Pending;
+ for (SUnit *SU : PQ) {
+ SchedCandidate TryCand(ZonePolicy);
+ initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
+ VGPRPressure, IsBottomUp);
+ // Pass SchedBoundary only when comparing nodes from the same boundary.
+ SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
+ tryPendingCandidate(Cand, TryCand, ZoneArg);
+ if (TryCand.Reason != NoCand) {
+ // Initialize resource delta if needed in case future heuristics query it.
+ if (TryCand.ResDelta == SchedResourceDelta())
+ TryCand.initResourceDelta(Zone.DAG, SchedModel);
+ LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
+ IsPending = true;
+ Cand.setBest(TryCand);
+ } else {
+ printCandidateDecision(TryCand, Cand);
}
}
}
// This function is mostly cut and pasted from
// GenericScheduler::pickNodeBidirectional()
-SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
+SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode,
+ bool &PickedPending) {
// Schedule as far as possible in the direction of no choice. This is most
// efficient, but also provides the best heuristics for CriticalPSets.
- if (SUnit *SU = Bot.pickOnlyChoice()) {
+ if (SUnit *SU = pickOnlyChoice(Bot, SchedModel)) {
IsTopNode = false;
return SU;
}
- if (SUnit *SU = Top.pickOnlyChoice()) {
+ if (SUnit *SU = pickOnlyChoice(Top, SchedModel)) {
IsTopNode = true;
return SU;
}
- // Set the bottom-up policy based on the state of the current bottom zone and
- // the instructions outside the zone, including the top zone.
+ // Set the bottom-up policy based on the state of the current bottom zone
+ // and the instructions outside the zone, including the top zone.
CandPolicy BotPolicy;
setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
// Set the top-down policy based on the state of the current top zone and
@@ -399,12 +469,14 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
CandPolicy TopPolicy;
setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
+ bool BotPending = false;
// See if BotCand is still valid (because we previously scheduled from Top).
LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
if (!BotCand.isValid() || BotCand.SU->isScheduled ||
BotCand.Policy != BotPolicy) {
BotCand.reset(CandPolicy());
pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand,
+ BotPending,
/*IsBottomUp=*/true);
assert(BotCand.Reason != NoCand && "failed to find the first candidate");
} else {
@@ -414,6 +486,7 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
SchedCandidate TCand;
TCand.reset(CandPolicy());
pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand,
+ BotPending,
/*IsBottomUp=*/true);
assert(TCand.SU == BotCand.SU &&
"Last pick result should correspond to re-picking right now");
@@ -421,12 +494,14 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
#endif
}
+ bool TopPending = false;
// Check if the top Q has a better candidate.
LLVM_DEBUG(dbgs() << "Picking from Top:\n");
if (!TopCand.isValid() || TopCand.SU->isScheduled ||
TopCand.Policy != TopPolicy) {
TopCand.reset(CandPolicy());
pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand,
+ TopPending,
/*IsBottomUp=*/false);
assert(TopCand.Reason != NoCand && "failed to find the first candidate");
} else {
@@ -436,6 +511,7 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
SchedCandidate TCand;
TCand.reset(CandPolicy());
pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
+ TopPending,
/*IsBottomUp=*/false);
assert(TCand.SU == TopCand.SU &&
"Last pick result should correspond to re-picking right now");
@@ -446,12 +522,21 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
// Pick best from BotCand and TopCand.
LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
dbgs() << "Bot Cand: "; traceCandidate(BotCand););
- SchedCandidate Cand = BotCand;
- TopCand.Reason = NoCand;
- tryCandidate(Cand, TopCand, nullptr);
- if (TopCand.Reason != NoCand) {
- Cand.setBest(TopCand);
+ SchedCandidate Cand = BotPending ? TopCand : BotCand;
+ SchedCandidate TryCand = BotPending ? BotCand : TopCand;
+ PickedPending = BotPending && TopPending;
+
+ TryCand.Reason = NoCand;
+ if (BotPending || TopPending) {
+    PickedPending |= tryPendingCandidate(Cand, TryCand, nullptr);
+ } else {
+ tryCandidate(Cand, TryCand, nullptr);
+ }
+
+ if (TryCand.Reason != NoCand) {
+ Cand.setBest(TryCand);
}
+
LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
IsTopNode = Cand.AtTop;
@@ -466,35 +551,55 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
return nullptr;
}
+ bool PickedPending;
SUnit *SU;
do {
+ PickedPending = false;
if (RegionPolicy.OnlyTopDown) {
- SU = Top.pickOnlyChoice();
+ SU = pickOnlyChoice(Top, SchedModel);
if (!SU) {
CandPolicy NoPolicy;
TopCand.reset(NoPolicy);
pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
+ PickedPending,
/*IsBottomUp=*/false);
assert(TopCand.Reason != NoCand && "failed to find a candidate");
SU = TopCand.SU;
}
IsTopNode = true;
} else if (RegionPolicy.OnlyBottomUp) {
- SU = Bot.pickOnlyChoice();
+ SU = pickOnlyChoice(Bot, SchedModel);
if (!SU) {
CandPolicy NoPolicy;
BotCand.reset(NoPolicy);
pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand,
+ PickedPending,
/*IsBottomUp=*/true);
assert(BotCand.Reason != NoCand && "failed to find a candidate");
SU = BotCand.SU;
}
IsTopNode = false;
} else {
- SU = pickNodeBidirectional(IsTopNode);
+ SU = pickNodeBidirectional(IsTopNode, PickedPending);
}
} while (SU->isScheduled);
+ if (PickedPending) {
+ unsigned ReadyCycle = IsTopNode ? SU->TopReadyCycle : SU->BotReadyCycle;
+ SchedBoundary &Zone = IsTopNode ? Top : Bot;
+ unsigned CurrentCycle = Zone.getCurrCycle();
+ if (ReadyCycle > CurrentCycle)
+ Zone.bumpCycle(ReadyCycle);
+
+ // FIXME: checkHazard() doesn't give information about which cycle the
+ // hazard will resolve so just keep bumping the cycle by 1. This could be
+ // made more efficient if checkHazard() returned more details.
+ while (Zone.checkHazard(SU))
+ Zone.bumpCycle(Zone.getCurrCycle() + 1);
+
+ Zone.releasePending();
+ }
+
if (SU->isTopReady())
Top.removeReady(SU);
if (SU->isBottomReady())
@@ -540,6 +645,47 @@ GCNSchedStageID GCNSchedStrategy::getNextStage() const {
return *std::next(CurrentStage);
}
+bool GCNSchedStrategy::tryPendingCandidate(SchedCandidate &Cand,
+ SchedCandidate &TryCand,
+ SchedBoundary *Zone) const {
+ // Initialize the candidate if needed.
+ if (!Cand.isValid()) {
+ TryCand.Reason = NodeOrder;
+ return true;
+ }
+
+  // Bias physical register defs and copies toward their uses and
+  // definitions, respectively.
+ if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
+ biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
+ return TryCand.Reason != NoCand;
+
+ // Avoid exceeding the target's limit.
+ if (DAG->isTrackingPressure() &&
+ tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
+ RegExcess, TRI, DAG->MF))
+ return TryCand.Reason != NoCand;
+
+ // Avoid increasing the max critical pressure in the scheduled region.
+ if (DAG->isTrackingPressure() &&
+ tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
+ TryCand, Cand, RegCritical, TRI, DAG->MF))
+ return TryCand.Reason != NoCand;
+
+ bool SameBoundary = Zone != nullptr;
+ if (SameBoundary) {
+ TryCand.initResourceDelta(DAG, SchedModel);
+ if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
+ TryCand, Cand, ResourceReduce))
+ return TryCand.Reason != NoCand;
+ if (tryGreater(TryCand.ResDelta.DemandedResources,
+ Cand.ResDelta.DemandedResources, TryCand, Cand,
+ ResourceDemand))
+ return TryCand.Reason != NoCand;
+ }
+
+ return false;
+}
+
GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
const MachineSchedContext *C, bool IsLegacyScheduler)
: GCNSchedStrategy(C) {
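A note on tryPendingCandidate() above: it intentionally runs only the
register-pressure and resource heuristics. A rough sketch of that shape in
plain C++ (illustrative only, not LLVM API; names are invented):

    struct Cand {
      bool Valid = false;
      int ExcessRP = 0;   // pressure above the target's limit
      int CriticalRP = 0; // pressure above the critical limit
    };

    // Returns true when Try displaces Best. Latency and source-order
    // tie-breakers are deliberately absent: a pending node is not ready yet,
    // so only spill-avoidance style criteria can justify picking it.
    bool preferPending(Cand &Best, const Cand &Try) {
      if (!Best.Valid) {
        Best = Try;
        Best.Valid = true;
        return true;
      }
      if (Try.ExcessRP < Best.ExcessRP ||
          (Try.ExcessRP == Best.ExcessRP &&
           Try.CriticalRP < Best.CriticalRP)) {
        Best = Try;
        return true;
      }
      return false;
    }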
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 8ea4267..975781f 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -44,17 +44,32 @@ raw_ostream &operator<<(raw_ostream &OS, const GCNSchedStageID &StageID);
/// heuristics to determine excess/critical pressure sets.
class GCNSchedStrategy : public GenericScheduler {
protected:
- SUnit *pickNodeBidirectional(bool &IsTopNode);
+ SUnit *pickNodeBidirectional(bool &IsTopNode, bool &PickedPending);
void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
const RegPressureTracker &RPTracker,
- SchedCandidate &Cand, bool IsBottomUp);
+ SchedCandidate &Cand, bool &IsPending,
+ bool IsBottomUp);
void initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop,
const RegPressureTracker &RPTracker,
const SIRegisterInfo *SRI, unsigned SGPRPressure,
unsigned VGPRPressure, bool IsBottomUp);
+ /// Evaluates instructions in the pending queue using a subset of scheduling
+ /// heuristics.
+ ///
+ /// Instructions that cannot be issued due to hardware constraints are placed
+ /// in the pending queue rather than the available queue, making them normally
+ /// invisible to scheduling heuristics. However, in certain scenarios (such as
+ /// avoiding register spilling), it may be beneficial to consider scheduling
+ /// these not-yet-ready instructions.
+ bool tryPendingCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
+ SchedBoundary *Zone) const;
+
+ void printCandidateDecision(const SchedCandidate &Current,
+ const SchedCandidate &Preferred);
+
std::vector<unsigned> Pressure;
std::vector<unsigned> MaxPressure;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d516330..50447f4 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -9072,6 +9072,67 @@ void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
MachineOperand &Src1 = Inst.getOperand(2);
const DebugLoc &DL = Inst.getDebugLoc();
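+  // With true 16-bit registers, the pack can be built directly as a
+  // REG_SEQUENCE of lo16/hi16 subregisters instead of a VALU shift/mask
+  // expansion.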
+ if (ST.useRealTrue16Insts()) {
+ Register SrcReg0, SrcReg1;
+ if (!Src0.isReg() || !RI.isVGPR(MRI, Src0.getReg())) {
+ SrcReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg0).add(Src0);
+ } else {
+ SrcReg0 = Src0.getReg();
+ }
+
+ if (!Src1.isReg() || !RI.isVGPR(MRI, Src1.getReg())) {
+ SrcReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg1).add(Src1);
+ } else {
+ SrcReg1 = Src1.getReg();
+ }
+
+ bool isSrc0Reg16 = MRI.constrainRegClass(SrcReg0, &AMDGPU::VGPR_16RegClass);
+ bool isSrc1Reg16 = MRI.constrainRegClass(SrcReg1, &AMDGPU::VGPR_16RegClass);
+
+ auto NewMI = BuildMI(*MBB, Inst, DL, get(AMDGPU::REG_SEQUENCE), ResultReg);
+ switch (Inst.getOpcode()) {
+ case AMDGPU::S_PACK_LL_B32_B16:
+ NewMI
+ .addReg(SrcReg0, 0,
+ isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
+ .addImm(AMDGPU::lo16)
+ .addReg(SrcReg1, 0,
+ isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
+ .addImm(AMDGPU::hi16);
+ break;
+ case AMDGPU::S_PACK_LH_B32_B16:
+ NewMI
+ .addReg(SrcReg0, 0,
+ isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
+ .addImm(AMDGPU::lo16)
+ .addReg(SrcReg1, 0, AMDGPU::hi16)
+ .addImm(AMDGPU::hi16);
+ break;
+ case AMDGPU::S_PACK_HL_B32_B16:
+ NewMI.addReg(SrcReg0, 0, AMDGPU::hi16)
+ .addImm(AMDGPU::lo16)
+ .addReg(SrcReg1, 0,
+ isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
+ .addImm(AMDGPU::hi16);
+ break;
+ case AMDGPU::S_PACK_HH_B32_B16:
+ NewMI.addReg(SrcReg0, 0, AMDGPU::hi16)
+ .addImm(AMDGPU::lo16)
+ .addReg(SrcReg1, 0, AMDGPU::hi16)
+ .addImm(AMDGPU::hi16);
+ break;
+ default:
+ llvm_unreachable("unhandled s_pack_* instruction");
+ }
+
+ MachineOperand &Dest = Inst.getOperand(0);
+ MRI.replaceRegWith(Dest.getReg(), ResultReg);
+ addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
+ return;
+ }
+
switch (Inst.getOpcode()) {
case AMDGPU::S_PACK_LL_B32_B16: {
Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index e979eeb..df27ec1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -879,6 +879,11 @@ public:
MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
}
+ bool isMFMA(uint16_t Opcode) const {
+ return isMAI(Opcode) && Opcode != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
+ Opcode != AMDGPU::V_ACCVGPR_READ_B32_e64;
+ }
+
static bool isDOT(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::IsDOT;
}
@@ -895,6 +900,10 @@ public:
return isMFMA(MI) || isWMMA(MI) || isSWMMAC(MI);
}
+ bool isMFMAorWMMA(uint16_t Opcode) const {
+ return isMFMA(Opcode) || isWMMA(Opcode) || isSWMMAC(Opcode);
+ }
+
static bool isSWMMAC(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::IsSWMMAC;
}
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index 01a40c1..7431e11 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -47,9 +47,6 @@ private:
const MachineBasicBlock &From,
const MachineBasicBlock &To) const;
bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
- // Check if the machine instruction being processed is a supported packed
- // instruction.
- bool isUnpackingSupportedInstr(MachineInstr &MI) const;
// Creates a list of packed instructions following an MFMA that are suitable
// for unpacking.
void collectUnpackingCandidates(MachineInstr &BeginMI,
@@ -454,23 +451,6 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
return true;
}
-// If support is extended to new operations, add tests in
-// llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir.
-bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const {
- if (!TII->isNeverCoissue(MI))
- return false;
- unsigned Opcode = MI.getOpcode();
- switch (Opcode) {
- case AMDGPU::V_PK_ADD_F32:
- case AMDGPU::V_PK_MUL_F32:
- case AMDGPU::V_PK_FMA_F32:
- return true;
- default:
- return false;
- }
- llvm_unreachable("Fully covered switch");
-}
-
bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) {
unsigned OpCode = MI.getOpcode();
Register DstReg = MI.getOperand(0).getReg();
@@ -612,10 +592,13 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
MachineInstr &Instr = *I;
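+    // mapToUnpackedOpcode() returns UINT16_MAX when the instruction has no
+    // unpacked form.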
+ uint16_t UnpackedOpCode = mapToUnpackedOpcode(Instr);
+    bool IsUnpackable =
+        UnpackedOpCode != std::numeric_limits<uint16_t>::max();
if (Instr.isMetaInstruction())
continue;
if ((Instr.isTerminator()) ||
- (TII->isNeverCoissue(Instr) && !isUnpackingSupportedInstr(Instr)) ||
+ (TII->isNeverCoissue(Instr) && !IsUnpackable) ||
(SIInstrInfo::modifiesModeRegister(Instr) &&
Instr.modifiesRegister(AMDGPU::EXEC, TRI)))
return;
@@ -639,7 +622,7 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
if (TRI->regsOverlap(MFMADef, InstrMO.getReg()))
return;
}
- if (!isUnpackingSupportedInstr(Instr))
+ if (!IsUnpackable)
continue;
if (canUnpackingClobberRegister(Instr))
@@ -687,8 +670,8 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
bool IsHiBits) {
MachineBasicBlock &MBB = *I.getParent();
const DebugLoc &DL = I.getDebugLoc();
- const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src0);
- const MachineOperand *SrcMO2 = TII->getNamedOperand(I, AMDGPU::OpName::src1);
+ const MachineOperand *SrcMO0 = TII->getNamedOperand(I, AMDGPU::OpName::src0);
+ const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src1);
Register DstReg = I.getOperand(0).getReg();
unsigned OpCode = I.getOpcode();
Register UnpackedDstReg = IsHiBits ? TRI->getSubReg(DstReg, AMDGPU::sub1)
@@ -702,15 +685,15 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
NewMI.addDef(UnpackedDstReg); // vdst
- addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO1);
- addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO2);
+ addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO0);
+ addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO1);
if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
- const MachineOperand *SrcMO3 =
+ const MachineOperand *SrcMO2 =
TII->getNamedOperand(I, AMDGPU::OpName::src2);
unsigned Src2Mods =
TII->getNamedOperand(I, AMDGPU::OpName::src2_modifiers)->getImm();
- addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO3);
+ addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO2);
}
NewMI.addImm(ClampVal); // clamp
// Packed instructions do not support output modifiers. Safe to assign them 0
@@ -787,9 +770,13 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) {
// TODO: Fold this into previous block, if possible. Evaluate and handle any
// side effects.
+
+  // Perform the extra MachineFunction scans only for supported architectures.
+ if (!ST.hasGFX940Insts())
+ return Changed;
for (MachineBasicBlock &MBB : MF) {
- // Unpack packed instructions overlapped by MFMAs. This allows the compiler
- // to co-issue unpacked instructions with MFMA
+ // Unpack packed instructions overlapped by MFMAs. This allows the
+    // compiler to co-issue unpacked instructions with MFMAs.
auto SchedModel = TII->getSchedModel();
SetVector<MachineInstr *> InstrsToUnpack;
for (auto &MI : make_early_inc_range(MBB.instrs())) {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index a01a5fd..5e3195b 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1697,9 +1697,6 @@ LLVM_READNONE
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi);
LLVM_READNONE
-bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi);
-
-LLVM_READNONE
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi);
LLVM_READNONE
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 67ea2dd..35e1127 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -21287,21 +21287,28 @@ bool ARMTargetLowering::useLoadStackGuardNode(const Module &M) const {
}
void ARMTargetLowering::insertSSPDeclarations(Module &M) const {
+  // MSVC CRT provides functionality for stack protection.
RTLIB::LibcallImpl SecurityCheckCookieLibcall =
getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
- if (SecurityCheckCookieLibcall == RTLIB::Unsupported)
- return TargetLowering::insertSSPDeclarations(M);
- // MSVC CRT has a global variable holding security cookie.
- M.getOrInsertGlobal("__security_cookie",
- PointerType::getUnqual(M.getContext()));
+ RTLIB::LibcallImpl SecurityCookieVar =
+ getLibcallImpl(RTLIB::STACK_CHECK_GUARD);
+ if (SecurityCheckCookieLibcall != RTLIB::Unsupported &&
+ SecurityCookieVar != RTLIB::Unsupported) {
+ // MSVC CRT has a global variable holding security cookie.
+ M.getOrInsertGlobal(getLibcallImplName(SecurityCookieVar),
+ PointerType::getUnqual(M.getContext()));
- // MSVC CRT has a function to validate security cookie.
- FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
- getLibcallImplName(SecurityCheckCookieLibcall),
- Type::getVoidTy(M.getContext()), PointerType::getUnqual(M.getContext()));
- if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
- F->addParamAttr(0, Attribute::AttrKind::InReg);
+ // MSVC CRT has a function to validate security cookie.
+ FunctionCallee SecurityCheckCookie =
+ M.getOrInsertFunction(getLibcallImplName(SecurityCheckCookieLibcall),
+ Type::getVoidTy(M.getContext()),
+ PointerType::getUnqual(M.getContext()));
+ if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
+ F->addParamAttr(0, Attribute::AttrKind::InReg);
+ }
+
+ TargetLowering::insertSSPDeclarations(M);
}
Function *ARMTargetLowering::getSSPStackGuardCheck(const Module &M) const {
diff --git a/llvm/lib/Target/Mips/MipsFastISel.cpp b/llvm/lib/Target/Mips/MipsFastISel.cpp
index 1ce8d7e3..df0c8c1 100644
--- a/llvm/lib/Target/Mips/MipsFastISel.cpp
+++ b/llvm/lib/Target/Mips/MipsFastISel.cpp
@@ -264,9 +264,10 @@ public:
} // end anonymous namespace
-static bool CC_Mips(unsigned ValNo, MVT ValVT, MVT LocVT,
- CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- Type *OrigTy, CCState &State) LLVM_ATTRIBUTE_UNUSED;
+[[maybe_unused]] static bool CC_Mips(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, Type *OrigTy,
+ CCState &State);
static bool CC_MipsO32_FP32(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp
index 7f1ff45..2fd7327 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -3176,9 +3176,10 @@ static bool CC_MipsO32_FP64(unsigned ValNo, MVT ValVT, MVT LocVT,
F64Regs);
}
-static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT,
- CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- Type *OrigTy, CCState &State) LLVM_ATTRIBUTE_UNUSED;
+[[maybe_unused]] static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, Type *OrigTy,
+ CCState &State);
#include "MipsGenCallingConv.inc"
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 272c21f..2f1a7ad 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -749,7 +749,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setTruncStoreAction(VT, MVT::i1, Expand);
}
- // Disable generations of extload/truncstore for v2i16/v2i8. The generic
+  // Disable generation of extload/truncstore for v2i32/v2i16/v2i8. The generic
// expansion for these nodes when they are unaligned is incorrect if the
// type is a vector.
//
@@ -757,7 +757,11 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// TargetLowering::expandUnalignedLoad/Store.
setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i16,
MVT::v2i8, Expand);
+ setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i32,
+ {MVT::v2i8, MVT::v2i16}, Expand);
setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
+ setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
+ setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
// Register custom handling for illegal type loads/stores. We'll try to custom
// lower almost all illegal types and logic in the lowering will discard cases
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 5ceb477..19992e6 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -695,6 +695,9 @@ def HasStdExtZvfbfa : Predicate<"Subtarget->hasStdExtZvfbfa()">,
def FeatureStdExtZvfbfmin
: RISCVExtension<1, 0, "Vector BF16 Converts", [FeatureStdExtZve32f]>;
+def HasStdExtZvfbfmin : Predicate<"Subtarget->hasStdExtZvfbfmin()">,
+ AssemblerPredicate<(all_of FeatureStdExtZvfbfmin),
+ "'Zvfbfmin' (Vector BF16 Converts)">;
def FeatureStdExtZvfbfwma
: RISCVExtension<1, 0, "Vector BF16 widening mul-add",
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 7123a2d..169465e 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1672,6 +1672,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
if (Subtarget.useRVVForFixedLengthVectors())
setTargetDAGCombine(ISD::BITCAST);
+ setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
+
// Disable strict node mutation.
IsStrictFPEnabled = true;
EnableExtLdPromotion = true;
@@ -24828,7 +24830,8 @@ bool RISCVTargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
// instruction, as it is usually smaller than the alternative sequence.
// TODO: Add vector division?
bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
- return OptSize && !VT.isVector();
+ return OptSize && !VT.isVector() &&
+ VT.getSizeInBits() <= getMaxDivRemBitWidthSupported();
}
bool RISCVTargetLowering::preferScalarizeSplat(SDNode *N) const {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index 65865ce..eb3c9b0 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -5862,20 +5862,6 @@ multiclass VPatConversionWF_VF<string intrinsic, string instruction,
}
}
-multiclass VPatConversionWF_VF_BF<string intrinsic, string instruction,
- bit isSEWAware = 0> {
- foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in
- {
- defvar fvti = fvtiToFWti.Vti;
- defvar fwti = fvtiToFWti.Wti;
- let Predicates = !listconcat(GetVTypePredicates<fvti>.Predicates,
- GetVTypePredicates<fwti>.Predicates) in
- defm : VPatConversion<intrinsic, instruction, "V",
- fwti.Vector, fvti.Vector, fwti.Mask, fvti.Log2SEW,
- fvti.LMul, fwti.RegClass, fvti.RegClass, isSEWAware>;
- }
-}
-
multiclass VPatConversionVI_WF<string intrinsic, string instruction> {
foreach vtiToWti = AllWidenableIntToFloatVectors in {
defvar vti = vtiToWti.Vti;
@@ -5969,20 +5955,6 @@ multiclass VPatConversionVF_WF_RTZ<string intrinsic, string instruction,
}
}
-multiclass VPatConversionVF_WF_BF_RM<string intrinsic, string instruction,
- bit isSEWAware = 0> {
- foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in {
- defvar fvti = fvtiToFWti.Vti;
- defvar fwti = fvtiToFWti.Wti;
- let Predicates = !listconcat(GetVTypePredicates<fvti>.Predicates,
- GetVTypePredicates<fwti>.Predicates) in
- defm : VPatConversionRoundingMode<intrinsic, instruction, "W",
- fvti.Vector, fwti.Vector, fvti.Mask, fvti.Log2SEW,
- fvti.LMul, fvti.RegClass, fwti.RegClass,
- isSEWAware>;
- }
-}
-
multiclass VPatCompare_VI<string intrinsic, string inst,
ImmLeaf ImmType> {
foreach vti = AllIntegerVectors in {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td
index 0be9eab..c9c1246 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td
@@ -36,7 +36,7 @@ defm VFWMACCBF16_V : VWMAC_FV_V_F<"vfwmaccbf16", 0b111011>;
//===----------------------------------------------------------------------===//
// Pseudo instructions
//===----------------------------------------------------------------------===//
-let Predicates = [HasStdExtZvfbfminOrZvfofp8min] in {
+let Predicates = [HasStdExtZvfbfmin] in {
defm PseudoVFWCVTBF16_F_F : VPseudoVWCVTD_V;
defm PseudoVFNCVTBF16_F_F : VPseudoVNCVTD_W_RM;
}
@@ -47,7 +47,31 @@ let mayRaiseFPException = true, Predicates = [HasStdExtZvfbfwma] in
//===----------------------------------------------------------------------===//
// Patterns
//===----------------------------------------------------------------------===//
-let Predicates = [HasStdExtZvfbfminOrZvfofp8min] in {
+multiclass VPatConversionWF_VF_BF<string intrinsic, string instruction,
+ bit isSEWAware = 0> {
+ foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in
+ {
+ defvar fvti = fvtiToFWti.Vti;
+ defvar fwti = fvtiToFWti.Wti;
+ defm : VPatConversion<intrinsic, instruction, "V",
+ fwti.Vector, fvti.Vector, fwti.Mask, fvti.Log2SEW,
+ fvti.LMul, fwti.RegClass, fvti.RegClass, isSEWAware>;
+ }
+}
+
+multiclass VPatConversionVF_WF_BF_RM<string intrinsic, string instruction,
+ bit isSEWAware = 0> {
+ foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in {
+ defvar fvti = fvtiToFWti.Vti;
+ defvar fwti = fvtiToFWti.Wti;
+ defm : VPatConversionRoundingMode<intrinsic, instruction, "W",
+ fvti.Vector, fwti.Vector, fvti.Mask, fvti.Log2SEW,
+ fvti.LMul, fvti.RegClass, fwti.RegClass,
+ isSEWAware>;
+ }
+}
+
+let Predicates = [HasStdExtZvfbfmin] in {
defm : VPatConversionWF_VF_BF<"int_riscv_vfwcvtbf16_f_f_v",
"PseudoVFWCVTBF16_F_F", isSEWAware=1>;
defm : VPatConversionVF_WF_BF_RM<"int_riscv_vfncvtbf16_f_f_w",
@@ -56,7 +80,6 @@ let Predicates = [HasStdExtZvfbfminOrZvfofp8min] in {
foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in {
defvar fvti = fvtiToFWti.Vti;
defvar fwti = fvtiToFWti.Wti;
- let Predicates = [HasVInstructionsBF16Minimal] in
def : Pat<(fwti.Vector (any_riscv_fpextend_vl
(fvti.Vector fvti.RegClass:$rs1),
(fvti.Mask VMV0:$vm),
@@ -66,18 +89,16 @@ let Predicates = [HasStdExtZvfbfminOrZvfofp8min] in {
(fvti.Mask VMV0:$vm),
GPR:$vl, fvti.Log2SEW, TA_MA)>;
- let Predicates = [HasVInstructionsBF16Minimal] in
- def : Pat<(fvti.Vector (any_riscv_fpround_vl
- (fwti.Vector fwti.RegClass:$rs1),
- (fwti.Mask VMV0:$vm), VLOpFrag)),
- (!cast<Instruction>("PseudoVFNCVTBF16_F_F_W_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK")
- (fvti.Vector (IMPLICIT_DEF)), fwti.RegClass:$rs1,
- (fwti.Mask VMV0:$vm),
- // Value to indicate no rounding mode change in
- // RISCVInsertReadWriteCSR
- FRM_DYN,
- GPR:$vl, fvti.Log2SEW, TA_MA)>;
- let Predicates = [HasVInstructionsBF16Minimal] in
+ def : Pat<(fvti.Vector (any_riscv_fpround_vl
+ (fwti.Vector fwti.RegClass:$rs1),
+ (fwti.Mask VMV0:$vm), VLOpFrag)),
+ (!cast<Instruction>("PseudoVFNCVTBF16_F_F_W_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK")
+ (fvti.Vector (IMPLICIT_DEF)), fwti.RegClass:$rs1,
+ (fwti.Mask VMV0:$vm),
+ // Value to indicate no rounding mode change in
+ // RISCVInsertReadWriteCSR
+ FRM_DYN,
+ GPR:$vl, fvti.Log2SEW, TA_MA)>;
def : Pat<(fvti.Vector (fpround (fwti.Vector fwti.RegClass:$rs1))),
(!cast<Instruction>("PseudoVFNCVTBF16_F_F_W_"#fvti.LMul.MX#"_E"#fvti.SEW)
(fvti.Vector (IMPLICIT_DEF)),
diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
index 56a6168..640b014 100644
--- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
@@ -78,6 +78,8 @@ public:
void outputExecutionModeFromNumthreadsAttribute(
const MCRegister &Reg, const Attribute &Attr,
SPIRV::ExecutionMode::ExecutionMode EM);
+ void outputExecutionModeFromEnableMaximalReconvergenceAttr(
+ const MCRegister &Reg, const SPIRVSubtarget &ST);
void outputExecutionMode(const Module &M);
void outputAnnotations(const Module &M);
void outputModuleSections();
@@ -495,6 +497,20 @@ void SPIRVAsmPrinter::outputExecutionModeFromNumthreadsAttribute(
outputMCInst(Inst);
}
+void SPIRVAsmPrinter::outputExecutionModeFromEnableMaximalReconvergenceAttr(
+ const MCRegister &Reg, const SPIRVSubtarget &ST) {
+ assert(ST.canUseExtension(SPIRV::Extension::SPV_KHR_maximal_reconvergence) &&
+ "Function called when SPV_KHR_maximal_reconvergence is not enabled.");
+
+ MCInst Inst;
+ Inst.setOpcode(SPIRV::OpExecutionMode);
+ Inst.addOperand(MCOperand::createReg(Reg));
+ unsigned EM =
+ static_cast<unsigned>(SPIRV::ExecutionMode::MaximallyReconvergesKHR);
+ Inst.addOperand(MCOperand::createImm(EM));
+ outputMCInst(Inst);
+}
+
void SPIRVAsmPrinter::outputExecutionMode(const Module &M) {
NamedMDNode *Node = M.getNamedMetadata("spirv.ExecutionMode");
if (Node) {
@@ -551,6 +567,10 @@ void SPIRVAsmPrinter::outputExecutionMode(const Module &M) {
if (Attribute Attr = F.getFnAttribute("hlsl.numthreads"); Attr.isValid())
outputExecutionModeFromNumthreadsAttribute(
FReg, Attr, SPIRV::ExecutionMode::LocalSize);
+ if (Attribute Attr = F.getFnAttribute("enable-maximal-reconvergence");
+ Attr.getValueAsBool()) {
+ outputExecutionModeFromEnableMaximalReconvergenceAttr(FReg, *ST);
+ }
if (MDNode *Node = F.getMetadata("work_group_size_hint"))
outputExecutionModeFromMDNode(FReg, Node,
SPIRV::ExecutionMode::LocalSizeHint, 3, 1);
diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
index 5f3ed86..96f5dee 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
@@ -153,7 +153,9 @@ static const std::map<std::string, SPIRV::Extension::Extension, std::less<>>
SPIRV::Extension::Extension::
SPV_EXT_relaxed_printf_string_address_space},
{"SPV_INTEL_predicated_io",
- SPIRV::Extension::Extension::SPV_INTEL_predicated_io}};
+ SPIRV::Extension::Extension::SPV_INTEL_predicated_io},
+ {"SPV_KHR_maximal_reconvergence",
+ SPIRV::Extension::Extension::SPV_KHR_maximal_reconvergence}};
bool SPIRVExtensionsParser::parse(cl::Option &O, StringRef ArgName,
StringRef ArgValue,
diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
index c6c6182..a151fd2 100644
--- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
@@ -1392,19 +1392,19 @@ void SPIRVEmitIntrinsics::preprocessCompositeConstants(IRBuilder<> &B) {
Constant *AggrConst = nullptr;
Type *ResTy = nullptr;
if (auto *COp = dyn_cast<ConstantVector>(Op)) {
- AggrConst = cast<Constant>(COp);
+ AggrConst = COp;
ResTy = COp->getType();
} else if (auto *COp = dyn_cast<ConstantArray>(Op)) {
- AggrConst = cast<Constant>(COp);
+ AggrConst = COp;
ResTy = B.getInt32Ty();
} else if (auto *COp = dyn_cast<ConstantStruct>(Op)) {
- AggrConst = cast<Constant>(COp);
+ AggrConst = COp;
ResTy = B.getInt32Ty();
} else if (auto *COp = dyn_cast<ConstantDataArray>(Op)) {
- AggrConst = cast<Constant>(COp);
+ AggrConst = COp;
ResTy = B.getInt32Ty();
} else if (auto *COp = dyn_cast<ConstantAggregateZero>(Op)) {
- AggrConst = cast<Constant>(COp);
+ AggrConst = COp;
ResTy = Op->getType()->isVectorTy() ? COp->getType() : B.getInt32Ty();
}
if (AggrConst) {
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index 5144fb1..61a0bbe 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -1200,6 +1200,23 @@ void addOpAccessChainReqs(const MachineInstr &Instr,
return;
}
+ bool IsNonUniform =
+ hasNonUniformDecoration(Instr.getOperand(0).getReg(), MRI);
+
+ auto FirstIndexReg = Instr.getOperand(3).getReg();
+ bool FirstIndexIsConstant =
+ Subtarget.getInstrInfo()->isConstantInstr(*MRI.getVRegDef(FirstIndexReg));
+
+ if (StorageClass == SPIRV::StorageClass::StorageClass::StorageBuffer) {
+ if (IsNonUniform)
+ Handler.addRequirements(
+ SPIRV::Capability::StorageBufferArrayNonUniformIndexingEXT);
+ else if (!FirstIndexIsConstant)
+ Handler.addRequirements(
+ SPIRV::Capability::StorageBufferArrayDynamicIndexing);
+ return;
+ }
+
Register PointeeTypeReg = ResTypeInst->getOperand(2).getReg();
MachineInstr *PointeeType = MRI.getUniqueVRegDef(PointeeTypeReg);
if (PointeeType->getOpcode() != SPIRV::OpTypeImage &&
@@ -1208,27 +1225,25 @@ void addOpAccessChainReqs(const MachineInstr &Instr,
return;
}
- bool IsNonUniform =
- hasNonUniformDecoration(Instr.getOperand(0).getReg(), MRI);
if (isUniformTexelBuffer(PointeeType)) {
if (IsNonUniform)
Handler.addRequirements(
SPIRV::Capability::UniformTexelBufferArrayNonUniformIndexingEXT);
- else
+ else if (!FirstIndexIsConstant)
Handler.addRequirements(
SPIRV::Capability::UniformTexelBufferArrayDynamicIndexingEXT);
} else if (isInputAttachment(PointeeType)) {
if (IsNonUniform)
Handler.addRequirements(
SPIRV::Capability::InputAttachmentArrayNonUniformIndexingEXT);
- else
+ else if (!FirstIndexIsConstant)
Handler.addRequirements(
SPIRV::Capability::InputAttachmentArrayDynamicIndexingEXT);
} else if (isStorageTexelBuffer(PointeeType)) {
if (IsNonUniform)
Handler.addRequirements(
SPIRV::Capability::StorageTexelBufferArrayNonUniformIndexingEXT);
- else
+ else if (!FirstIndexIsConstant)
Handler.addRequirements(
SPIRV::Capability::StorageTexelBufferArrayDynamicIndexingEXT);
} else if (isSampledImage(PointeeType) ||
@@ -1237,14 +1252,14 @@ void addOpAccessChainReqs(const MachineInstr &Instr,
if (IsNonUniform)
Handler.addRequirements(
SPIRV::Capability::SampledImageArrayNonUniformIndexingEXT);
- else
+ else if (!FirstIndexIsConstant)
Handler.addRequirements(
SPIRV::Capability::SampledImageArrayDynamicIndexing);
} else if (isStorageImage(PointeeType)) {
if (IsNonUniform)
Handler.addRequirements(
SPIRV::Capability::StorageImageArrayNonUniformIndexingEXT);
- else
+ else if (!FirstIndexIsConstant)
Handler.addRequirements(
SPIRV::Capability::StorageImageArrayDynamicIndexing);
}
@@ -2155,6 +2170,9 @@ static void collectReqs(const Module &M, SPIRV::ModuleAnalysisInfo &MAI,
SPIRV::OperandCategory::ExecutionModeOperand,
SPIRV::ExecutionMode::LocalSize, ST);
}
+ if (F.getFnAttribute("enable-maximal-reconvergence").getValueAsBool()) {
+ MAI.Reqs.addExtension(SPIRV::Extension::SPV_KHR_maximal_reconvergence);
+ }
if (F.getMetadata("work_group_size_hint"))
MAI.Reqs.getAndAddRequirements(
SPIRV::OperandCategory::ExecutionModeOperand,
diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
index 2625642..7d08b29 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
+++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
@@ -386,6 +386,7 @@ defm SPV_KHR_float_controls2 : ExtensionOperand<124, [EnvVulkan, EnvOpenCL]>;
defm SPV_INTEL_tensor_float32_conversion : ExtensionOperand<125, [EnvOpenCL]>;
defm SPV_KHR_bfloat16 : ExtensionOperand<126, [EnvVulkan, EnvOpenCL]>;
defm SPV_INTEL_predicated_io : ExtensionOperand<127, [EnvOpenCL]>;
+defm SPV_KHR_maximal_reconvergence : ExtensionOperand<128, [EnvVulkan]>;
//===----------------------------------------------------------------------===//
// Multiclass used to define Capabilities enum values and at the same time
@@ -698,7 +699,7 @@ defm IntersectionNV: ExecutionModelOperand<5314, [RayTracingNV]>;
defm AnyHitNV: ExecutionModelOperand<5315, [RayTracingNV]>;
defm ClosestHitNV: ExecutionModelOperand<5316, [RayTracingNV]>;
defm MissNV: ExecutionModelOperand<5317, [RayTracingNV]>;
-defm CallableNV: ExecutionModelOperand<5318, [RayTracingNV]>;
+defm CallableNV : ExecutionModelOperand<5318, [RayTracingNV]>;
//===----------------------------------------------------------------------===//
// Multiclass used to define MemoryModel enum values and at the same time
@@ -805,6 +806,7 @@ defm RoundingModeRTNINTEL : ExecutionModeOperand<5621, [RoundToInfinityINTEL]>;
defm FloatingPointModeALTINTEL : ExecutionModeOperand<5622, [FloatingPointModeINTEL]>;
defm FloatingPointModeIEEEINTEL : ExecutionModeOperand<5623, [FloatingPointModeINTEL]>;
defm FPFastMathDefault : ExecutionModeOperand<6028, [FloatControls2]>;
+defm MaximallyReconvergesKHR : ExecutionModeOperand<6023, [Shader]>;
//===----------------------------------------------------------------------===//
// Multiclass used to define StorageClass enum values and at the same time
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index eedfdb3..ed54404d 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -1763,6 +1763,26 @@ defm RELAXED_DOT :
"i16x8.relaxed_dot_i8x16_i7x16_s\t$dst, $lhs, $rhs",
"i16x8.relaxed_dot_i8x16_i7x16_s", 0x112>;
+def : Pat<
+ (v8i16 (add
+ (wasm_shuffle
+ (v8i16 (extmul_low_s v16i8:$lhs, v16i8:$rhs)),
+ (v8i16 (extmul_high_s v16i8:$lhs, v16i8:$rhs)),
+ (i32 0), (i32 1), (i32 4), (i32 5),
+ (i32 8), (i32 9), (i32 12), (i32 13),
+ (i32 16), (i32 17), (i32 20), (i32 21),
+ (i32 24), (i32 25), (i32 28), (i32 29)),
+ (wasm_shuffle
+ (v8i16 (extmul_low_s v16i8:$lhs, v16i8:$rhs)),
+ (v8i16 (extmul_high_s v16i8:$lhs, v16i8:$rhs)),
+ (i32 2), (i32 3), (i32 6), (i32 7),
+ (i32 10), (i32 11), (i32 14), (i32 15),
+ (i32 18), (i32 19), (i32 22), (i32 23),
+ (i32 26), (i32 27), (i32 30), (i32 31)))
+ ),
+ (v8i16 (RELAXED_DOT v16i8:$lhs, v16i8:$rhs))
+>;
+
defm RELAXED_DOT_ADD :
RELAXED_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs, V128:$acc),
(outs), (ins),
@@ -1771,6 +1791,14 @@ defm RELAXED_DOT_ADD :
"i32x4.relaxed_dot_i8x16_i7x16_add_s\t$dst, $lhs, $rhs, $acc",
"i32x4.relaxed_dot_i8x16_i7x16_add_s", 0x113>;
+def : Pat<
+ (v4i32 (add
+ (v4i32 (int_wasm_extadd_pairwise_signed
+ (v8i16 (int_wasm_relaxed_dot_i8x16_i7x16_signed v16i8:$lhs, v16i8:$rhs)))),
+ (v4i32 V128:$acc))),
+ (v4i32 (RELAXED_DOT_ADD v16i8:$lhs, v16i8:$rhs, (v4i32 V128:$acc)))
+ >;
+
def : Pat<(v4i32 (partial_reduce_smla (v4i32 V128:$acc), (v16i8 V128:$lhs),
(v16i8 V128:$rhs))),
(RELAXED_DOT_ADD $lhs, $rhs, $acc)>, Requires<[HasRelaxedSIMD]>;
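A scalar model of the lane arithmetic the two new patterns match (illustrative
C++, not part of the patch):

    #include <cstdint>

    // i16x8.relaxed_dot_i8x16_i7x16_s, one 16-bit lane: the even/odd byte
    // shuffles of the extmul halves select adjacent products, so the add is
    // a pairwise dot product.
    int16_t relaxed_dot_lane(const int8_t *a, const int8_t *b, int i) {
      return static_cast<int16_t>(a[2 * i] * b[2 * i] +
                                  a[2 * i + 1] * b[2 * i + 1]);
    }

    // i32x4.relaxed_dot_i8x16_i7x16_add_s, one 32-bit lane: extadd_pairwise
    // of the 16-bit dot, plus the accumulator.
    int32_t relaxed_dot_add_lane(const int8_t *a, const int8_t *b,
                                 int32_t acc, int i) {
      return relaxed_dot_lane(a, b, 2 * i) +
             relaxed_dot_lane(a, b, 2 * i + 1) + acc;
    }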
diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp
index b81641f..28fa2cd 100644
--- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp
+++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp
@@ -414,8 +414,6 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
getActionDefinitionsBuilder(G_SEXT_INREG).lower();
- getActionDefinitionsBuilder(G_IS_FPCLASS).lower();
-
// fp constants
getActionDefinitionsBuilder(G_FCONSTANT)
.legalFor({s32, s64})
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 6db780f..8e08d16 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -1338,6 +1338,10 @@ def ProcessorFeatures {
list<SubtargetFeature> PTLFeatures =
!listremove(ARLSFeatures, [FeatureWIDEKL]);
+ // Novalake
+ list<SubtargetFeature> NVLFeatures =
+ !listconcat(PTLFeatures, [FeaturePREFETCHI]);
+
// Clearwaterforest
list<SubtargetFeature> CWFAdditionalFeatures = [FeaturePREFETCHI,
FeatureAVXVNNIINT16,
@@ -1883,6 +1887,9 @@ foreach P = ["pantherlake", "wildcatlake"] in {
def : ProcModel<P, AlderlakePModel,
ProcessorFeatures.PTLFeatures, ProcessorFeatures.ADLTuning>;
}
+def : ProcModel<"novalake", AlderlakePModel, ProcessorFeatures.NVLFeatures,
+ ProcessorFeatures.ADLTuning>;
+
def : ProcModel<"clearwaterforest", AlderlakePModel,
ProcessorFeatures.CWFFeatures, ProcessorFeatures.ADLTuning>;
def : ProcModel<"emeraldrapids", SapphireRapidsModel,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a0b64ff..2feee05 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -29755,65 +29755,30 @@ static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
const X86Subtarget &Subtarget,
SelectionDAG &DAG,
SDValue *Low = nullptr) {
- unsigned NumElts = VT.getVectorNumElements();
-
// For vXi8 we will unpack the low and high half of each 128 bit lane to widen
// to a vXi16 type. Do the multiplies, shift the results and pack the half
// lane results back together.
// We'll take different approaches for signed and unsigned.
- // For unsigned we'll use punpcklbw/punpckhbw to put zero extend the bytes
- // and use pmullw to calculate the full 16-bit product.
+ // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes to
+ // words and use pmullw to calculate the full 16-bit product.
// For signed we'll use punpcklbw/punpckbw to extend the bytes to words and
// shift them left into the upper byte of each word. This allows us to use
// pmulhw to calculate the full 16-bit product. This trick means we don't
// need to sign extend the bytes to use pmullw.
-
- MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
+ MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
SDValue Zero = DAG.getConstant(0, dl, VT);
- SDValue ALo, AHi;
+ SDValue ALo, AHi, BLo, BHi;
if (IsSigned) {
ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
- AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
- } else {
- ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
- AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
- }
-
- SDValue BLo, BHi;
- if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
- // If the RHS is a constant, manually unpackl/unpackh and extend.
- SmallVector<SDValue, 16> LoOps, HiOps;
- for (unsigned i = 0; i != NumElts; i += 16) {
- for (unsigned j = 0; j != 8; ++j) {
- SDValue LoOp = B.getOperand(i + j);
- SDValue HiOp = B.getOperand(i + j + 8);
-
- if (IsSigned) {
- LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
- HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
- LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
- DAG.getConstant(8, dl, MVT::i16));
- HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
- DAG.getConstant(8, dl, MVT::i16));
- } else {
- LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
- HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
- }
-
- LoOps.push_back(LoOp);
- HiOps.push_back(HiOp);
- }
- }
-
- BLo = DAG.getBuildVector(ExVT, dl, LoOps);
- BHi = DAG.getBuildVector(ExVT, dl, HiOps);
- } else if (IsSigned) {
BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
+ AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
} else {
+ ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
+ AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
}
@@ -29826,7 +29791,7 @@ static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
if (Low)
*Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
- return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
+ return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf=*/true);
}
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
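The signed path above leans on a shift-into-the-high-byte trick; a one-lane
scalar model (illustrative C++, not part of the patch):

    #include <cstdint>

    // Placing each byte in the high half of a 16-bit word makes
    // ((a << 8) * (b << 8)) >> 16 == a * b, which is exactly the pmulhw
    // result (high 16 bits of the signed 32-bit product), so no explicit
    // sign extension is needed.
    int16_t mul_i8_via_pmulhw(int8_t a, int8_t b) {
      int16_t wa = static_cast<int16_t>(a) << 8; // what the unpack produces
      int16_t wb = static_cast<int16_t>(b) << 8;
      return static_cast<int16_t>((static_cast<int32_t>(wa) * wb) >> 16);
    }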
@@ -47761,6 +47726,15 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
DL, DAG, Subtarget))
return V;
+  // If the sign bit of the condition is known, the BLENDV can be folded away.
+ if (N->getOpcode() == X86ISD::BLENDV) {
+ KnownBits KnownCond = DAG.computeKnownBits(Cond);
+ if (KnownCond.isNegative())
+ return LHS;
+ if (KnownCond.isNonNegative())
+ return RHS;
+ }
+
if (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV) {
SmallVector<int, 64> CondMask;
if (createShuffleMaskFromVSELECT(CondMask, Cond,
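A per-lane scalar model of the new fold (illustrative C++, not part of the
patch):

    #include <cstdint>

    // X86ISD::BLENDV picks the first input when the condition's sign bit is
    // set, otherwise the second.
    int8_t blendv_lane(int8_t cond, int8_t lhs, int8_t rhs) {
      return cond < 0 ? lhs : rhs;
    }
    // When known-bits analysis proves cond < 0 in every lane the blend is
    // just lhs; when it proves cond >= 0 it is just rhs.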
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index 6dd43b2..37d7772 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -606,16 +606,24 @@ Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
void X86TargetLowering::insertSSPDeclarations(Module &M) const {
// MSVC CRT provides functionality for stack protection.
- if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
- Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
+ RTLIB::LibcallImpl SecurityCheckCookieLibcall =
+ getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
+
+ RTLIB::LibcallImpl SecurityCookieVar =
+ getLibcallImpl(RTLIB::STACK_CHECK_GUARD);
+ if (SecurityCheckCookieLibcall != RTLIB::Unsupported &&
+ SecurityCookieVar != RTLIB::Unsupported) {
+    // MSVC CRT has a global variable holding the security cookie.
- M.getOrInsertGlobal("__security_cookie",
+ M.getOrInsertGlobal(getLibcallImplName(SecurityCookieVar),
PointerType::getUnqual(M.getContext()));
// MSVC CRT has a function to validate security cookie.
- FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
- "__security_check_cookie", Type::getVoidTy(M.getContext()),
- PointerType::getUnqual(M.getContext()));
+ FunctionCallee SecurityCheckCookie =
+ M.getOrInsertFunction(getLibcallImplName(SecurityCheckCookieLibcall),
+ Type::getVoidTy(M.getContext()),
+ PointerType::getUnqual(M.getContext()));
+
if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
F->setCallingConv(CallingConv::X86_FastCall);
F->addParamAttr(0, Attribute::AttrKind::InReg);
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 1d2cd39..5c23f91 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -10809,39 +10809,27 @@ void X86InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
if (!ST.hasSSE1())
return;
- // PXOR is safe to use because it doesn't affect flags.
- BuildMI(MBB, Iter, DL, get(X86::PXORrr), Reg)
- .addReg(Reg, RegState::Undef)
- .addReg(Reg, RegState::Undef);
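+  // V_SET0 expands after RA to a flags-preserving zeroing idiom, like the
+  // explicit PXOR it replaces.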
+ BuildMI(MBB, Iter, DL, get(X86::V_SET0), Reg);
} else if (X86::VR256RegClass.contains(Reg)) {
// YMM#
if (!ST.hasAVX())
return;
- // VPXOR is safe to use because it doesn't affect flags.
- BuildMI(MBB, Iter, DL, get(X86::VPXORrr), Reg)
- .addReg(Reg, RegState::Undef)
- .addReg(Reg, RegState::Undef);
+ BuildMI(MBB, Iter, DL, get(X86::AVX_SET0), Reg);
} else if (X86::VR512RegClass.contains(Reg)) {
// ZMM#
if (!ST.hasAVX512())
return;
- // VPXORY is safe to use because it doesn't affect flags.
- BuildMI(MBB, Iter, DL, get(X86::VPXORYrr), Reg)
- .addReg(Reg, RegState::Undef)
- .addReg(Reg, RegState::Undef);
+ BuildMI(MBB, Iter, DL, get(X86::AVX512_512_SET0), Reg);
} else if (X86::VK1RegClass.contains(Reg) || X86::VK2RegClass.contains(Reg) ||
X86::VK4RegClass.contains(Reg) || X86::VK8RegClass.contains(Reg) ||
X86::VK16RegClass.contains(Reg)) {
if (!ST.hasVLX())
return;
- // KXOR is safe to use because it doesn't affect flags.
- unsigned Op = ST.hasBWI() ? X86::KXORQkk : X86::KXORWkk;
- BuildMI(MBB, Iter, DL, get(Op), Reg)
- .addReg(Reg, RegState::Undef)
- .addReg(Reg, RegState::Undef);
+ unsigned Op = ST.hasBWI() ? X86::KSET0Q : X86::KSET0W;
+ BuildMI(MBB, Iter, DL, get(Op), Reg);
}
}
diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp
index 1fca466f..713d504 100644
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -1928,6 +1928,17 @@ static void addConstantComments(const MachineInstr *MI,
#define INSTR_CASE(Prefix, Instr, Suffix, Postfix) \
case X86::Prefix##Instr##Suffix##rm##Postfix:
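+// Variant of CASE_ARITH_RM for EVEX-only instructions (e.g. PMULLQ) that
+// have no SSE or AVX-128/256 forms, so only the Z128/Z256/Z cases apply.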
+#define CASE_AVX512_ARITH_RM(Instr) \
+ INSTR_CASE(V, Instr, Z128, ) \
+ INSTR_CASE(V, Instr, Z128, k) \
+ INSTR_CASE(V, Instr, Z128, kz) \
+ INSTR_CASE(V, Instr, Z256, ) \
+ INSTR_CASE(V, Instr, Z256, k) \
+ INSTR_CASE(V, Instr, Z256, kz) \
+ INSTR_CASE(V, Instr, Z, ) \
+ INSTR_CASE(V, Instr, Z, k) \
+ INSTR_CASE(V, Instr, Z, kz)
+
#define CASE_ARITH_RM(Instr) \
INSTR_CASE(, Instr, , ) /* SSE */ \
INSTR_CASE(V, Instr, , ) /* AVX-128 */ \
@@ -1943,22 +1954,12 @@ static void addConstantComments(const MachineInstr *MI,
INSTR_CASE(V, Instr, Z, kz)
// TODO: Add additional instructions when useful.
- CASE_ARITH_RM(PMADDUBSW) {
- unsigned SrcIdx = getSrcIdx(MI, 1);
- if (auto *C = X86::getConstantFromPool(*MI, SrcIdx + 1)) {
- std::string Comment;
- raw_string_ostream CS(Comment);
- unsigned VectorWidth =
- X86::getVectorRegisterWidth(MI->getDesc().operands()[0]);
- CS << "[";
- printConstant(C, VectorWidth, CS);
- CS << "]";
- OutStreamer.AddComment(CS.str());
- }
- break;
- }
-
+ CASE_ARITH_RM(PMADDUBSW)
CASE_ARITH_RM(PMADDWD)
+ CASE_ARITH_RM(PMULDQ)
+ CASE_ARITH_RM(PMULUDQ)
+ CASE_ARITH_RM(PMULLD)
+ CASE_AVX512_ARITH_RM(PMULLQ)
CASE_ARITH_RM(PMULLW)
CASE_ARITH_RM(PMULHW)
CASE_ARITH_RM(PMULHUW)