Diffstat (limited to 'llvm/lib/Target/X86')
-rw-r--r--  llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp | 74
-rw-r--r--  llvm/lib/Target/X86/CMakeLists.txt | 2
-rw-r--r--  llvm/lib/Target/X86/GISel/X86CallLowering.cpp | 3
-rw-r--r--  llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp | 47
-rw-r--r--  llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp | 1
-rw-r--r--  llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp | 14
-rw-r--r--  llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp | 2
-rw-r--r--  llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp | 2
-rw-r--r--  llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp | 2
-rw-r--r--  llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp | 5
-rw-r--r--  llvm/lib/Target/X86/X86.h | 67
-rw-r--r--  llvm/lib/Target/X86/X86.td | 14
-rw-r--r--  llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp | 6
-rw-r--r--  llvm/lib/Target/X86/X86AvoidTrailingCall.cpp | 33
-rw-r--r--  llvm/lib/Target/X86/X86CompressEVEX.cpp | 12
-rw-r--r--  llvm/lib/Target/X86/X86DiscriminateMemOps.cpp | 184
-rw-r--r--  llvm/lib/Target/X86/X86DomainReassignment.cpp | 4
-rw-r--r--  llvm/lib/Target/X86/X86DynAllocaExpander.cpp | 42
-rw-r--r--  llvm/lib/Target/X86/X86ExpandPseudo.cpp | 24
-rw-r--r--  llvm/lib/Target/X86/X86FastPreTileConfig.cpp | 3
-rw-r--r--  llvm/lib/Target/X86/X86FixupBWInsts.cpp | 3
-rw-r--r--  llvm/lib/Target/X86/X86FlagsCopyLowering.cpp | 1
-rw-r--r--  llvm/lib/Target/X86/X86FloatingPoint.cpp | 1064
-rw-r--r--  llvm/lib/Target/X86/X86FrameLowering.cpp | 7
-rw-r--r--  llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 8
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp | 833
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.h | 6
-rw-r--r--  llvm/lib/Target/X86/X86ISelLoweringCall.cpp | 2
-rw-r--r--  llvm/lib/Target/X86/X86InsertPrefetch.cpp | 259
-rw-r--r--  llvm/lib/Target/X86/X86InstrAMX.td | 24
-rw-r--r--  llvm/lib/Target/X86/X86InstrAVX512.td | 40
-rw-r--r--  llvm/lib/Target/X86/X86InstrCMovSetCC.td | 6
-rw-r--r--  llvm/lib/Target/X86/X86InstrFragmentsSIMD.td | 2
-rw-r--r--  llvm/lib/Target/X86/X86InstrInfo.cpp | 101
-rw-r--r--  llvm/lib/Target/X86/X86InstrInfo.h | 14
-rw-r--r--  llvm/lib/Target/X86/X86InstrSSE.td | 34
-rw-r--r--  llvm/lib/Target/X86/X86IntrinsicsInfo.h | 6
-rw-r--r--  llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp | 19
-rw-r--r--  llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp | 48
-rw-r--r--  llvm/lib/Target/X86/X86OptimizeLEAs.cpp | 2
-rw-r--r--  llvm/lib/Target/X86/X86PartialReduction.cpp | 72
-rw-r--r--  llvm/lib/Target/X86/X86PassRegistry.def | 10
-rw-r--r--  llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp | 3
-rw-r--r--  llvm/lib/Target/X86/X86Subtarget.cpp | 1
-rw-r--r--  llvm/lib/Target/X86/X86TargetMachine.cpp | 21
-rw-r--r--  llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 73
-rw-r--r--  llvm/lib/Target/X86/X86TargetTransformInfo.h | 30
-rw-r--r--  llvm/lib/Target/X86/X86VZeroUpper.cpp | 2
48 files changed, 1778 insertions(+), 1454 deletions(-)
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index b7ea672..bac3692 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -2470,10 +2470,10 @@ bool X86AsmParser::ParseIntelOffsetOperator(const MCExpr *&Val, StringRef &ID,
// Report back its kind, or IOK_INVALID if does not evaluated as a known one
unsigned X86AsmParser::IdentifyIntelInlineAsmOperator(StringRef Name) {
return StringSwitch<unsigned>(Name)
- .Cases("TYPE","type",IOK_TYPE)
- .Cases("SIZE","size",IOK_SIZE)
- .Cases("LENGTH","length",IOK_LENGTH)
- .Default(IOK_INVALID);
+ .Cases({"TYPE", "type"}, IOK_TYPE)
+ .Cases({"SIZE", "size"}, IOK_SIZE)
+ .Cases({"LENGTH", "length"}, IOK_LENGTH)
+ .Default(IOK_INVALID);
}
/// Parse the 'LENGTH', 'TYPE' and 'SIZE' operators. The LENGTH operator
@@ -2516,8 +2516,8 @@ unsigned X86AsmParser::ParseIntelInlineAsmOperator(unsigned OpKind) {
unsigned X86AsmParser::IdentifyMasmOperator(StringRef Name) {
return StringSwitch<unsigned>(Name.lower())
.Case("type", MOK_TYPE)
- .Cases("size", "sizeof", MOK_SIZEOF)
- .Cases("length", "lengthof", MOK_LENGTHOF)
+ .Cases({"size", "sizeof"}, MOK_SIZEOF)
+ .Cases({"length", "lengthof"}, MOK_LENGTHOF)
.Default(MOK_INVALID);
}
@@ -2581,21 +2581,21 @@ bool X86AsmParser::ParseMasmOperator(unsigned OpKind, int64_t &Val) {
bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size,
StringRef *SizeStr) {
Size = StringSwitch<unsigned>(getTok().getString())
- .Cases("BYTE", "byte", 8)
- .Cases("WORD", "word", 16)
- .Cases("DWORD", "dword", 32)
- .Cases("FLOAT", "float", 32)
- .Cases("LONG", "long", 32)
- .Cases("FWORD", "fword", 48)
- .Cases("DOUBLE", "double", 64)
- .Cases("QWORD", "qword", 64)
- .Cases("MMWORD","mmword", 64)
- .Cases("XWORD", "xword", 80)
- .Cases("TBYTE", "tbyte", 80)
- .Cases("XMMWORD", "xmmword", 128)
- .Cases("YMMWORD", "ymmword", 256)
- .Cases("ZMMWORD", "zmmword", 512)
- .Default(0);
+ .Cases({"BYTE", "byte"}, 8)
+ .Cases({"WORD", "word"}, 16)
+ .Cases({"DWORD", "dword"}, 32)
+ .Cases({"FLOAT", "float"}, 32)
+ .Cases({"LONG", "long"}, 32)
+ .Cases({"FWORD", "fword"}, 48)
+ .Cases({"DOUBLE", "double"}, 64)
+ .Cases({"QWORD", "qword"}, 64)
+ .Cases({"MMWORD", "mmword"}, 64)
+ .Cases({"XWORD", "xword"}, 80)
+ .Cases({"TBYTE", "tbyte"}, 80)
+ .Cases({"XMMWORD", "xmmword"}, 128)
+ .Cases({"YMMWORD", "ymmword"}, 256)
+ .Cases({"ZMMWORD", "zmmword"}, 512)
+ .Default(0);
if (Size) {
if (SizeStr)
*SizeStr = getTok().getString();
@@ -2886,22 +2886,22 @@ bool X86AsmParser::parseATTOperand(OperandVector &Operands) {
// otherwise the EFLAGS Condition Code enumerator.
X86::CondCode X86AsmParser::ParseConditionCode(StringRef CC) {
return StringSwitch<X86::CondCode>(CC)
- .Case("o", X86::COND_O) // Overflow
- .Case("no", X86::COND_NO) // No Overflow
- .Cases("b", "nae", X86::COND_B) // Below/Neither Above nor Equal
- .Cases("ae", "nb", X86::COND_AE) // Above or Equal/Not Below
- .Cases("e", "z", X86::COND_E) // Equal/Zero
- .Cases("ne", "nz", X86::COND_NE) // Not Equal/Not Zero
- .Cases("be", "na", X86::COND_BE) // Below or Equal/Not Above
- .Cases("a", "nbe", X86::COND_A) // Above/Neither Below nor Equal
- .Case("s", X86::COND_S) // Sign
- .Case("ns", X86::COND_NS) // No Sign
- .Cases("p", "pe", X86::COND_P) // Parity/Parity Even
- .Cases("np", "po", X86::COND_NP) // No Parity/Parity Odd
- .Cases("l", "nge", X86::COND_L) // Less/Neither Greater nor Equal
- .Cases("ge", "nl", X86::COND_GE) // Greater or Equal/Not Less
- .Cases("le", "ng", X86::COND_LE) // Less or Equal/Not Greater
- .Cases("g", "nle", X86::COND_G) // Greater/Neither Less nor Equal
+ .Case("o", X86::COND_O) // Overflow
+ .Case("no", X86::COND_NO) // No Overflow
+ .Cases({"b", "nae"}, X86::COND_B) // Below/Neither Above nor Equal
+ .Cases({"ae", "nb"}, X86::COND_AE) // Above or Equal/Not Below
+ .Cases({"e", "z"}, X86::COND_E) // Equal/Zero
+ .Cases({"ne", "nz"}, X86::COND_NE) // Not Equal/Not Zero
+ .Cases({"be", "na"}, X86::COND_BE) // Below or Equal/Not Above
+ .Cases({"a", "nbe"}, X86::COND_A) // Above/Neither Below nor Equal
+ .Case("s", X86::COND_S) // Sign
+ .Case("ns", X86::COND_NS) // No Sign
+ .Cases({"p", "pe"}, X86::COND_P) // Parity/Parity Even
+ .Cases({"np", "po"}, X86::COND_NP) // No Parity/Parity Odd
+ .Cases({"l", "nge"}, X86::COND_L) // Less/Neither Greater nor Equal
+ .Cases({"ge", "nl"}, X86::COND_GE) // Greater or Equal/Not Less
+ .Cases({"le", "ng"}, X86::COND_LE) // Less or Equal/Not Greater
+ .Cases({"g", "nle"}, X86::COND_G) // Greater/Neither Less nor Equal
.Default(X86::COND_INVALID);
}
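
Note on the change above: StringSwitch::Cases now takes each group of equivalent spellings as a single braced initializer list followed by the mapped value, instead of a variadic string argument list. A minimal standalone sketch of the new form (the helper name and keyword set are illustrative, not part of the patch):

  #include "llvm/ADT/StringSwitch.h"
  using namespace llvm;

  // Illustrative helper only: each braced group maps several spellings to one value.
  static unsigned classifyOperandWidth(StringRef Kw) {
    return StringSwitch<unsigned>(Kw)
        .Cases({"BYTE", "byte"}, 8)
        .Cases({"WORD", "word"}, 16)
        .Default(0);
  }
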
diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt
index f9bd233..434a6d2 100644
--- a/llvm/lib/Target/X86/CMakeLists.txt
+++ b/llvm/lib/Target/X86/CMakeLists.txt
@@ -31,7 +31,6 @@ set(sources
X86CmovConversion.cpp
X86CodeGenPassBuilder.cpp
X86DomainReassignment.cpp
- X86DiscriminateMemOps.cpp
X86LowerTileCopy.cpp
X86LowerAMXType.cpp
X86LowerAMXIntrinsics.cpp
@@ -57,7 +56,6 @@ set(sources
X86IndirectBranchTracking.cpp
X86IndirectThunks.cpp
X86InterleavedAccess.cpp
- X86InsertPrefetch.cpp
X86InstCombineIntrinsic.cpp
X86InstrFMA3Info.cpp
X86InstrFoldTables.cpp
diff --git a/llvm/lib/Target/X86/GISel/X86CallLowering.cpp b/llvm/lib/Target/X86/GISel/X86CallLowering.cpp
index c0b9339..b07ce2b 100644
--- a/llvm/lib/Target/X86/GISel/X86CallLowering.cpp
+++ b/llvm/lib/Target/X86/GISel/X86CallLowering.cpp
@@ -280,8 +280,7 @@ bool X86CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
if (Arg.hasAttribute(Attribute::ByVal) ||
Arg.hasAttribute(Attribute::InReg) ||
Arg.hasAttribute(Attribute::SwiftSelf) ||
- Arg.hasAttribute(Attribute::SwiftError) ||
- Arg.hasAttribute(Attribute::Nest) || VRegs[Idx].size() > 1)
+ Arg.hasAttribute(Attribute::SwiftError) || VRegs[Idx].size() > 1)
return false;
if (Arg.hasAttribute(Attribute::StructRet)) {
diff --git a/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp b/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp
index 53ec712..f499e6f 100644
--- a/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp
+++ b/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp
@@ -312,6 +312,53 @@ bool X86InstructionSelector::selectCopy(MachineInstr &I,
}
}
+ // Special case GPR16 -> XMM
+ if (SrcSize == 16 && SrcRegBank.getID() == X86::GPRRegBankID &&
+ (DstRegBank.getID() == X86::VECRRegBankID)) {
+
+ const DebugLoc &DL = I.getDebugLoc();
+
+ // Any extend GPR16 -> GPR32
+ Register ExtReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+ BuildMI(*I.getParent(), I, DL, TII.get(TargetOpcode::SUBREG_TO_REG),
+ ExtReg)
+ .addImm(0)
+ .addReg(SrcReg)
+ .addImm(X86::sub_16bit);
+
+ // Copy GR32 -> XMM
+ BuildMI(*I.getParent(), I, DL, TII.get(TargetOpcode::COPY), DstReg)
+ .addReg(ExtReg);
+
+ I.eraseFromParent();
+ }
+
+ // Special case XMM -> GR16
+ if (DstSize == 16 && DstRegBank.getID() == X86::GPRRegBankID &&
+ (SrcRegBank.getID() == X86::VECRRegBankID)) {
+
+ const DebugLoc &DL = I.getDebugLoc();
+
+ // Move XMM to GR32 register.
+ Register Temp32 = MRI.createVirtualRegister(&X86::GR32RegClass);
+ BuildMI(*I.getParent(), I, DL, TII.get(TargetOpcode::COPY), Temp32)
+ .addReg(SrcReg);
+
+ // Extract the lower 16 bits
+ if (Register Dst32 = TRI.getMatchingSuperReg(DstReg, X86::sub_16bit,
+ &X86::GR32RegClass)) {
+ // Optimization for Physical Dst (e.g. AX): Copy to EAX directly.
+ BuildMI(*I.getParent(), I, DL, TII.get(TargetOpcode::COPY), Dst32)
+ .addReg(Temp32);
+ } else {
+ // Handle if there is no super.
+ BuildMI(*I.getParent(), I, DL, TII.get(TargetOpcode::COPY), DstReg)
+ .addReg(Temp32, 0, X86::sub_16bit);
+ }
+
+ I.eraseFromParent();
+ }
+
return true;
}
diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp
index e792b1b..812fa85 100644
--- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp
+++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp
@@ -269,6 +269,7 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
getActionDefinitionsBuilder(G_ICMP)
.legalForCartesianProduct({s8}, Is64Bit ? IntTypes64 : IntTypes32)
.clampScalar(0, s8, s8)
+ .widenScalarToNextPow2(1, /*Min=*/8)
.clampScalar(1, s8, sMaxScalar);
// bswap
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 74de51c..0a98331 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -195,7 +195,7 @@ public:
bool padInstructionEncoding(MCFragment &RF, MCCodeEmitter &Emitter,
unsigned &RemainingSize) const;
- bool finishLayout(const MCAssembler &Asm) const override;
+ bool finishLayout() const override;
unsigned getMaximumNopSize(const MCSubtargetInfo &STI) const override;
@@ -850,7 +850,7 @@ bool X86AsmBackend::padInstructionEncoding(MCFragment &RF,
return Changed;
}
-bool X86AsmBackend::finishLayout(const MCAssembler &Asm) const {
+bool X86AsmBackend::finishLayout() const {
// See if we can further relax some instructions to cut down on the number of
// nop bytes required for code alignment. The actual win is in reducing
// instruction count, not number of bytes. Modern X86-64 can easily end up
@@ -864,11 +864,11 @@ bool X86AsmBackend::finishLayout(const MCAssembler &Asm) const {
// MCSymbols and therefore different relaxation results. X86PadForAlign is
// disabled by default to eliminate the -g vs non -g difference.
DenseSet<MCFragment *> LabeledFragments;
- for (const MCSymbol &S : Asm.symbols())
+ for (const MCSymbol &S : Asm->symbols())
LabeledFragments.insert(S.getFragment());
bool Changed = false;
- for (MCSection &Sec : Asm) {
+ for (MCSection &Sec : *Asm) {
if (!Sec.isText())
continue;
@@ -908,13 +908,13 @@ bool X86AsmBackend::finishLayout(const MCAssembler &Asm) const {
// the align directive. This is purely about human understandability
// of the resulting code. If we later find a reason to expand
// particular instructions over others, we can adjust.
- unsigned RemainingSize = Asm.computeFragmentSize(F) - F.getFixedSize();
+ unsigned RemainingSize = Asm->computeFragmentSize(F) - F.getFixedSize();
while (!Relaxable.empty() && RemainingSize != 0) {
auto &RF = *Relaxable.pop_back_val();
// Give the backend a chance to play any tricks it wishes to increase
// the encoding size of the given instruction. Target independent code
// will try further relaxation, but target's may play further tricks.
- Changed |= padInstructionEncoding(RF, Asm.getEmitter(), RemainingSize);
+ Changed |= padInstructionEncoding(RF, Asm->getEmitter(), RemainingSize);
// If we have an instruction which hasn't been fully relaxed, we can't
// skip past it and insert bytes before it. Changing its starting
@@ -1391,7 +1391,7 @@ public:
return CU::UNWIND_MODE_DWARF;
MCRegister Reg = *MRI.getLLVMRegNum(Inst.getRegister(), true);
- SavedRegs[SavedRegIdx++] = Reg;
+ SavedRegs[SavedRegIdx++] = Reg.id();
StackAdjust += OffsetSize;
MinAbsOffset = std::min(MinAbsOffset, std::abs(Inst.getOffset()));
InstrOffset += PushInstrSize(Reg);
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
index 759d95e..88dd543 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
@@ -451,7 +451,7 @@ void X86InstPrinterCommon::printVKPair(const MCInst *MI, unsigned OpNo,
// the assembly would look something like:
// "vp2intersect %zmm5, %zmm7, {%k2, %k3}"
// but this can work too.
- switch (MI->getOperand(OpNo).getReg()) {
+ switch (MI->getOperand(OpNo).getReg().id()) {
case X86::K0_K1:
printRegName(OS, X86::K0);
return;
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index af5a698..0c874b7 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -535,7 +535,7 @@ bool X86MCInstrAnalysis::clearsSuperRegisters(const MCRegisterInfo &MRI,
const MCRegisterClass &VR128XRC = MRI.getRegClass(X86::VR128XRegClassID);
const MCRegisterClass &VR256XRC = MRI.getRegClass(X86::VR256XRegClassID);
- auto ClearsSuperReg = [=](unsigned RegID) {
+ auto ClearsSuperReg = [=](MCRegister RegID) {
// On X86-64, a general purpose integer register is viewed as a 64-bit
// register internal to the processor.
// An update to the lower 32 bits of a 64 bit integer register is
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
index 1ef10928..abbb0c2 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
@@ -65,7 +65,7 @@ void X86WinCOFFStreamer::emitCVFPOData(const MCSymbol *ProcSym, SMLoc Loc) {
}
void X86WinCOFFStreamer::finishImpl() {
- emitFrames(nullptr);
+ emitFrames();
emitWindowsUnwindTables();
MCWinCOFFStreamer::finishImpl();
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp
index 9c44231..b722964 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp
@@ -55,6 +55,7 @@ struct FPOInstruction {
StackAlign,
SetFrame,
} Op;
+ // FIXME: This should be a union of MCRegister and unsigned.
unsigned RegOrOffset;
};
@@ -215,7 +216,7 @@ bool X86WinCOFFTargetStreamer::emitFPOSetFrame(MCRegister Reg, SMLoc L) {
FPOInstruction Inst;
Inst.Label = emitFPOLabel();
Inst.Op = FPOInstruction::SetFrame;
- Inst.RegOrOffset = Reg;
+ Inst.RegOrOffset = Reg.id();
CurFPOData->Instructions.push_back(Inst);
return false;
}
@@ -226,7 +227,7 @@ bool X86WinCOFFTargetStreamer::emitFPOPushReg(MCRegister Reg, SMLoc L) {
FPOInstruction Inst;
Inst.Label = emitFPOLabel();
Inst.Op = FPOInstruction::PushReg;
- Inst.RegOrOffset = Reg;
+ Inst.RegOrOffset = Reg.id();
CurFPOData->Instructions.push_back(Inst);
return false;
}
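
Note on the MCRegister hunks above (X86AsmBackend, X86WinCOFFTargetStreamer): assignments of an MCRegister into a plain unsigned field now take the raw register number through an explicit Reg.id() call instead of relying on a conversion. A minimal sketch, with a hypothetical struct standing in for FPOInstruction:

  #include "llvm/MC/MCRegister.h"
  using namespace llvm;

  struct RegSlot {
    unsigned RawReg; // raw register number, like FPOInstruction::RegOrOffset
  };

  // Illustrative helper only: take the raw encoding explicitly via id().
  static RegSlot makeRegSlot(MCRegister Reg) { return {Reg.id()}; }
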
diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h
index 51b540a..97848be 100644
--- a/llvm/lib/Target/X86/X86.h
+++ b/llvm/lib/Target/X86/X86.h
@@ -14,6 +14,7 @@
#ifndef LLVM_LIB_TARGET_X86_X86_H
#define LLVM_LIB_TARGET_X86_X86_H
+#include "llvm/CodeGen/MachineFunctionAnalysisManager.h"
#include "llvm/IR/Analysis.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Support/CodeGen.h"
@@ -43,7 +44,13 @@ FunctionPass *createCleanupLocalDynamicTLSPass();
/// This function returns a pass which converts floating-point register
/// references and pseudo instructions into floating-point stack references and
/// physical instructions.
-FunctionPass *createX86FloatingPointStackifierPass();
+class X86FPStackifierPass : public PassInfoMixin<X86FPStackifierPass> {
+public:
+ PreservedAnalyses run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM);
+};
+
+FunctionPass *createX86FPStackifierLegacyPass();
/// This pass inserts AVX vzeroupper instructions before each call to avoid
/// transition penalty between functions encoded with AVX and SSE.
@@ -83,7 +90,14 @@ FunctionPass *createX86AvoidStoreForwardingBlocks();
FunctionPass *createX86FlagsCopyLoweringPass();
/// Return a pass that expands DynAlloca pseudo-instructions.
-FunctionPass *createX86DynAllocaExpander();
+class X86DynAllocaExpanderPass
+ : public PassInfoMixin<X86DynAllocaExpanderPass> {
+public:
+ PreservedAnalyses run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM);
+};
+
+FunctionPass *createX86DynAllocaExpanderLegacyPass();
/// Return a pass that config the tile registers.
FunctionPass *createX86TileConfigPass();
@@ -104,7 +118,15 @@ FunctionPass *createX86LowerTileCopyPass();
/// CALL instruction. The pass does the same for each funclet as well. This
/// ensures that the open interval of function start and end PCs contains all
/// return addresses for the benefit of the Windows x64 unwinder.
-FunctionPass *createX86AvoidTrailingCallPass();
+class X86AvoidTrailingCallPass
+ : public PassInfoMixin<X86AvoidTrailingCallPass> {
+public:
+ PreservedAnalyses run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM);
+ static bool isRequired() { return true; }
+};
+
+FunctionPass *createX86AvoidTrailingCallLegacyPass();
/// Return a pass that optimizes the code-size of x86 call sequences. This is
/// done by replacing esp-relative movs with pushes.
@@ -144,13 +166,6 @@ FunctionPass *createX86IndirectThunksPass();
/// This pass replaces ret instructions with jmp's to __x86_return thunk.
FunctionPass *createX86ReturnThunksPass();
-/// This pass ensures instructions featuring a memory operand
-/// have distinctive <LineNumber, Discriminator> (with respect to each other)
-FunctionPass *createX86DiscriminateMemOpsPass();
-
-/// This pass applies profiling information to insert cache prefetches.
-FunctionPass *createX86InsertPrefetchPass();
-
/// This pass insert wait instruction after X87 instructions which could raise
/// fp exceptions when strict-fp enabled.
FunctionPass *createX86InsertX87waitPass();
@@ -158,7 +173,16 @@ FunctionPass *createX86InsertX87waitPass();
/// This pass optimizes arithmetic based on knowledge that is only used by
/// a reduction sequence and is therefore safe to reassociate in interesting
/// ways.
-FunctionPass *createX86PartialReductionPass();
+class X86PartialReductionPass : public PassInfoMixin<X86PartialReductionPass> {
+private:
+ const X86TargetMachine *TM;
+
+public:
+ X86PartialReductionPass(const X86TargetMachine *TM) : TM(TM) {}
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+};
+
+FunctionPass *createX86PartialReductionLegacyPass();
/// // Analyzes and emits pseudos to support Win x64 Unwind V2.
FunctionPass *createX86WinEHUnwindV2Pass();
@@ -179,7 +203,18 @@ FunctionPass *createX86LowerAMXTypeLegacyPass();
/// The pass transforms amx intrinsics to scalar operation if the function has
/// optnone attribute or it is O0.
-FunctionPass *createX86LowerAMXIntrinsicsPass();
+class X86LowerAMXIntrinsicsPass
+ : public PassInfoMixin<X86LowerAMXIntrinsicsPass> {
+private:
+ const TargetMachine *TM;
+
+public:
+ X86LowerAMXIntrinsicsPass(const TargetMachine *TM) : TM(TM) {}
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+ static bool isRequired() { return true; }
+};
+
+FunctionPass *createX86LowerAMXIntrinsicsLegacyPass();
InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM,
const X86Subtarget &,
@@ -193,7 +228,6 @@ FunctionPass *createX86ArgumentStackSlotPass();
FunctionPass *createX86SuppressAPXForRelocationPass();
void initializeCompressEVEXPassPass(PassRegistry &);
-void initializeFPSPass(PassRegistry &);
void initializeFixupBWInstPassPass(PassRegistry &);
void initializeFixupLEAPassPass(PassRegistry &);
void initializeX86ArgumentStackSlotPassPass(PassRegistry &);
@@ -202,14 +236,15 @@ void initializeX86FixupInstTuningPassPass(PassRegistry &);
void initializeX86FixupVectorConstantsPassPass(PassRegistry &);
void initializeWinEHStatePassPass(PassRegistry &);
void initializeX86AvoidSFBPassPass(PassRegistry &);
-void initializeX86AvoidTrailingCallPassPass(PassRegistry &);
+void initializeX86AvoidTrailingCallLegacyPassPass(PassRegistry &);
void initializeX86CallFrameOptimizationPass(PassRegistry &);
void initializeX86CmovConverterPassPass(PassRegistry &);
void initializeX86DAGToDAGISelLegacyPass(PassRegistry &);
void initializeX86DomainReassignmentPass(PassRegistry &);
-void initializeX86DynAllocaExpanderPass(PassRegistry &);
+void initializeX86DynAllocaExpanderLegacyPass(PassRegistry &);
void initializeX86ExecutionDomainFixPass(PassRegistry &);
void initializeX86ExpandPseudoPass(PassRegistry &);
+void initializeX86FPStackifierLegacyPass(PassRegistry &);
void initializeX86FastPreTileConfigPass(PassRegistry &);
void initializeX86FastTileConfigPass(PassRegistry &);
void initializeX86FixupSetCCPassPass(PassRegistry &);
@@ -220,7 +255,7 @@ void initializeX86LowerAMXIntrinsicsLegacyPassPass(PassRegistry &);
void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &);
void initializeX86LowerTileCopyPass(PassRegistry &);
void initializeX86OptimizeLEAPassPass(PassRegistry &);
-void initializeX86PartialReductionPass(PassRegistry &);
+void initializeX86PartialReductionLegacyPass(PassRegistry &);
void initializeX86PreTileConfigPass(PassRegistry &);
void initializeX86ReturnThunksPass(PassRegistry &);
void initializeX86SpeculativeExecutionSideEffectSuppressionPass(PassRegistry &);
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 9e291a6..8f29a64 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -795,6 +795,8 @@ include "X86Schedule.td"
include "X86InstrInfo.td"
include "X86SchedPredicates.td"
+defm : RemapAllTargetPseudoPointerOperands<x86_ptr_rc>;
+
def X86InstrInfo : InstrInfo;
//===----------------------------------------------------------------------===//
@@ -1334,8 +1336,18 @@ def ProcessorFeatures {
!listremove(ARLSFeatures, [FeatureWIDEKL]);
// Novalake
+ list<SubtargetFeature> NVLAdditionalFeatures = [FeatureAVX10_2,
+ FeatureMOVRS,
+ FeatureEGPR,
+ FeaturePush2Pop2,
+ FeaturePPX,
+ FeatureNF,
+ FeatureNDD,
+ FeatureZU,
+ FeatureCCMP,
+ FeaturePREFETCHI];
list<SubtargetFeature> NVLFeatures =
- !listconcat(PTLFeatures, [FeaturePREFETCHI]);
+ !listconcat(PTLFeatures, NVLAdditionalFeatures);
// Clearwaterforest
list<SubtargetFeature> CWFAdditionalFeatures = [FeaturePREFETCHI,
diff --git a/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp b/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
index d2e3527..9473e8d 100644
--- a/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
+++ b/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
@@ -387,8 +387,8 @@ void X86AvoidSFBPass::buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode,
MachineMemOperand *LMMO = *LoadInst->memoperands_begin();
MachineMemOperand *SMMO = *StoreInst->memoperands_begin();
- Register Reg1 = MRI->createVirtualRegister(
- TII->getRegClass(TII->get(NLoadOpcode), 0, TRI));
+ Register Reg1 =
+ MRI->createVirtualRegister(TII->getRegClass(TII->get(NLoadOpcode), 0));
MachineInstr *NewLoad =
BuildMI(*MBB, LoadInst, LoadInst->getDebugLoc(), TII->get(NLoadOpcode),
Reg1)
@@ -553,7 +553,7 @@ void X86AvoidSFBPass::findPotentiallylBlockedCopies(MachineFunction &MF) {
}
unsigned X86AvoidSFBPass::getRegSizeInBytes(MachineInstr *LoadInst) {
- const auto *TRC = TII->getRegClass(TII->get(LoadInst->getOpcode()), 0, TRI);
+ const auto *TRC = TII->getRegClass(TII->get(LoadInst->getOpcode()), 0);
return TRI->getRegSizeInBits(*TRC) / 8;
}
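
Note on the getRegClass calls above: the TargetRegisterInfo argument has been dropped, leaving the two-argument getRegClass(Desc, OpNum) form (the X86DomainReassignment hunk later in this patch makes the same change). A minimal sketch with a hypothetical helper:

  #include "llvm/CodeGen/MachineRegisterInfo.h"
  #include "llvm/CodeGen/TargetInstrInfo.h"
  using namespace llvm;

  // Illustrative helper only: create a virtual register whose class comes from
  // the instruction description, using the two-argument overload.
  static Register createDefVReg(const TargetInstrInfo *TII,
                                MachineRegisterInfo *MRI, unsigned Opcode) {
    const MCInstrDesc &Desc = TII->get(Opcode);
    const TargetRegisterClass *RC = TII->getRegClass(Desc, /*OpNum=*/0);
    return MRI->createVirtualRegister(RC);
  }
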
diff --git a/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp b/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp
index 2ecf493..ebd4284 100644
--- a/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp
+++ b/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp
@@ -37,6 +37,8 @@
#include "X86Subtarget.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Analysis.h"
+#include "llvm/IR/PassManager.h"
#define AVOIDCALL_DESC "X86 avoid trailing call pass"
#define AVOIDCALL_NAME "x86-avoid-trailing-call"
@@ -46,9 +48,9 @@
using namespace llvm;
namespace {
-class X86AvoidTrailingCallPass : public MachineFunctionPass {
+class X86AvoidTrailingCallLegacyPass : public MachineFunctionPass {
public:
- X86AvoidTrailingCallPass() : MachineFunctionPass(ID) {}
+ X86AvoidTrailingCallLegacyPass() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -59,13 +61,14 @@ private:
};
} // end anonymous namespace
-char X86AvoidTrailingCallPass::ID = 0;
+char X86AvoidTrailingCallLegacyPass::ID = 0;
-FunctionPass *llvm::createX86AvoidTrailingCallPass() {
- return new X86AvoidTrailingCallPass();
+FunctionPass *llvm::createX86AvoidTrailingCallLegacyPass() {
+ return new X86AvoidTrailingCallLegacyPass();
}
-INITIALIZE_PASS(X86AvoidTrailingCallPass, AVOIDCALL_NAME, AVOIDCALL_DESC, false, false)
+INITIALIZE_PASS(X86AvoidTrailingCallLegacyPass, AVOIDCALL_NAME, AVOIDCALL_DESC,
+ false, false)
// A real instruction is a non-meta, non-pseudo instruction. Some pseudos
// expand to nothing, and some expand to code. This logic conservatively assumes
@@ -79,7 +82,7 @@ static bool isCallInstruction(const MachineInstr &MI) {
return MI.isCall() && !MI.isReturn();
}
-bool X86AvoidTrailingCallPass::runOnMachineFunction(MachineFunction &MF) {
+bool UpdatedOnX86AvoidTrailingCallPass(MachineFunction &MF) {
const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
const X86InstrInfo &TII = *STI.getInstrInfo();
assert(STI.isTargetWin64() && "pass only runs on Win64");
@@ -134,3 +137,19 @@ bool X86AvoidTrailingCallPass::runOnMachineFunction(MachineFunction &MF) {
return Changed;
}
+
+bool X86AvoidTrailingCallLegacyPass::runOnMachineFunction(MachineFunction &MF) {
+ return UpdatedOnX86AvoidTrailingCallPass(MF);
+}
+
+PreservedAnalyses
+X86AvoidTrailingCallPass::run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM) {
+ bool Changed = UpdatedOnX86AvoidTrailingCallPass(MF);
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA = PreservedAnalyses::none();
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
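
The renamed machine passes in this patch (X86AvoidTrailingCall above, X86DynAllocaExpander and X86FPStackifier below) all follow the same pattern: the shared logic is factored out, the legacy MachineFunctionPass becomes a thin *LegacyPass wrapper around it, and a PassInfoMixin class provides the new-pass-manager entry point. A minimal sketch with a hypothetical MyX86Pass (the name and trivial body are illustrative, not from the patch):

  #include "llvm/CodeGen/MachineFunction.h"
  #include "llvm/CodeGen/MachineFunctionAnalysisManager.h"
  #include "llvm/IR/Analysis.h"
  #include "llvm/IR/PassManager.h"
  using namespace llvm;

  // Placeholder for the shared implementation both wrappers call into.
  static bool runMyX86Impl(MachineFunction &MF) { return false; }

  class MyX86Pass : public PassInfoMixin<MyX86Pass> {
  public:
    PreservedAnalyses run(MachineFunction &MF,
                          MachineFunctionAnalysisManager &MFAM) {
      if (!runMyX86Impl(MF))
        return PreservedAnalyses::all();
      PreservedAnalyses PA = PreservedAnalyses::none();
      PA.preserveSet<CFGAnalyses>(); // the rewrite leaves the CFG intact
      return PA;
    }
  };
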
diff --git a/llvm/lib/Target/X86/X86CompressEVEX.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp
index c0c7f5a..0f55c19 100644
--- a/llvm/lib/Target/X86/X86CompressEVEX.cpp
+++ b/llvm/lib/Target/X86/X86CompressEVEX.cpp
@@ -15,6 +15,7 @@
// c. NDD (EVEX) -> non-NDD (legacy)
// d. NF_ND (EVEX) -> NF (EVEX)
// e. NonNF (EVEX) -> NF (EVEX)
+// f. SETZUCCm (EVEX) -> SETCCm (legacy)
//
// Compression a, b and c can always reduce code size, with some exceptions
// such as promoted 16-bit CRC32 which is as long as the legacy version.
@@ -216,14 +217,15 @@ static bool CompressEVEXImpl(MachineInstr &MI, MachineBasicBlock &MBB,
// memory form: broadcast
//
// APX:
- // MAP4: NDD
+ // MAP4: NDD, ZU
//
// For AVX512 cases, EVEX prefix is needed in order to carry this information
// thus preventing the transformation to VEX encoding.
bool IsND = X86II::hasNewDataDest(TSFlags);
- if (TSFlags & X86II::EVEX_B && !IsND)
- return false;
unsigned Opc = MI.getOpcode();
+ bool IsSetZUCCm = Opc == X86::SETZUCCm;
+ if (TSFlags & X86II::EVEX_B && !IsND && !IsSetZUCCm)
+ return false;
// MOVBE*rr is special because it has semantic of NDD but not set EVEX_B.
bool IsNDLike = IsND || Opc == X86::MOVBE32rr || Opc == X86::MOVBE64rr;
bool IsRedundantNDD = IsNDLike ? IsRedundantNewDataDest(Opc) : false;
@@ -272,7 +274,7 @@ static bool CompressEVEXImpl(MachineInstr &MI, MachineBasicBlock &MBB,
const MachineOperand &Src2 = MI.getOperand(2);
bool Is32BitReg = Opc == X86::ADD32ri_ND || Opc == X86::ADD32rr_ND;
const MCInstrDesc &NewDesc =
- ST.getInstrInfo()->get(Is32BitReg ? X86::LEA32r : X86::LEA64r);
+ ST.getInstrInfo()->get(Is32BitReg ? X86::LEA64_32r : X86::LEA64r);
if (Is32BitReg)
Src1 = getX86SubSuperRegister(Src1, 64);
MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), NewDesc, Dst)
@@ -339,7 +341,7 @@ bool CompressEVEXPass::runOnMachineFunction(MachineFunction &MF) {
}
#endif
const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
- if (!ST.hasAVX512() && !ST.hasEGPR() && !ST.hasNDD())
+ if (!ST.hasAVX512() && !ST.hasEGPR() && !ST.hasNDD() && !ST.hasZU())
return false;
bool Changed = false;
diff --git a/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp b/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp
deleted file mode 100644
index bd151a4..0000000
--- a/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp
+++ /dev/null
@@ -1,184 +0,0 @@
-//===- X86DiscriminateMemOps.cpp - Unique IDs for Mem Ops -----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// This pass aids profile-driven cache prefetch insertion by ensuring all
-/// instructions that have a memory operand are distinguishible from each other.
-///
-//===----------------------------------------------------------------------===//
-
-#include "X86.h"
-#include "X86Subtarget.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/ProfileData/SampleProf.h"
-#include "llvm/ProfileData/SampleProfReader.h"
-#include "llvm/Support/Debug.h"
-#include <optional>
-using namespace llvm;
-
-#define DEBUG_TYPE "x86-discriminate-memops"
-
-static cl::opt<bool> EnableDiscriminateMemops(
- DEBUG_TYPE, cl::init(false),
- cl::desc("Generate unique debug info for each instruction with a memory "
- "operand. Should be enabled for profile-driven cache prefetching, "
- "both in the build of the binary being profiled, as well as in "
- "the build of the binary consuming the profile."),
- cl::Hidden);
-
-static cl::opt<bool> BypassPrefetchInstructions(
- "x86-bypass-prefetch-instructions", cl::init(true),
- cl::desc("When discriminating instructions with memory operands, ignore "
- "prefetch instructions. This ensures the other memory operand "
- "instructions have the same identifiers after inserting "
- "prefetches, allowing for successive insertions."),
- cl::Hidden);
-
-namespace {
-
-using Location = std::pair<StringRef, unsigned>;
-
-Location diToLocation(const DILocation *Loc) {
- return std::make_pair(Loc->getFilename(), Loc->getLine());
-}
-
-/// Ensure each instruction having a memory operand has a distinct <LineNumber,
-/// Discriminator> pair.
-void updateDebugInfo(MachineInstr *MI, const DILocation *Loc) {
- DebugLoc DL(Loc);
- MI->setDebugLoc(DL);
-}
-
-class X86DiscriminateMemOps : public MachineFunctionPass {
- bool runOnMachineFunction(MachineFunction &MF) override;
- StringRef getPassName() const override {
- return "X86 Discriminate Memory Operands";
- }
-
-public:
- static char ID;
-
- /// Default construct and initialize the pass.
- X86DiscriminateMemOps();
-};
-
-bool IsPrefetchOpcode(unsigned Opcode) {
- return Opcode == X86::PREFETCHNTA || Opcode == X86::PREFETCHT0 ||
- Opcode == X86::PREFETCHT1 || Opcode == X86::PREFETCHT2 ||
- Opcode == X86::PREFETCHIT0 || Opcode == X86::PREFETCHIT1 ||
- Opcode == X86::PREFETCHRST2;
-}
-} // end anonymous namespace
-
-//===----------------------------------------------------------------------===//
-// Implementation
-//===----------------------------------------------------------------------===//
-
-char X86DiscriminateMemOps::ID = 0;
-
-/// Default construct and initialize the pass.
-X86DiscriminateMemOps::X86DiscriminateMemOps() : MachineFunctionPass(ID) {}
-
-bool X86DiscriminateMemOps::runOnMachineFunction(MachineFunction &MF) {
- if (!EnableDiscriminateMemops)
- return false;
-
- DISubprogram *FDI = MF.getFunction().getSubprogram();
- if (!FDI || !FDI->getUnit()->getDebugInfoForProfiling())
- return false;
-
- // Have a default DILocation, if we find instructions with memops that don't
- // have any debug info.
- const DILocation *ReferenceDI =
- DILocation::get(FDI->getContext(), FDI->getLine(), 0, FDI);
- assert(ReferenceDI && "ReferenceDI should not be nullptr");
- DenseMap<Location, unsigned> MemOpDiscriminators;
- MemOpDiscriminators[diToLocation(ReferenceDI)] = 0;
-
- // Figure out the largest discriminator issued for each Location. When we
- // issue new discriminators, we can thus avoid issuing discriminators
- // belonging to instructions that don't have memops. This isn't a requirement
- // for the goals of this pass, however, it avoids unnecessary ambiguity.
- for (auto &MBB : MF) {
- for (auto &MI : MBB) {
- const auto &DI = MI.getDebugLoc();
- if (!DI)
- continue;
- if (BypassPrefetchInstructions && IsPrefetchOpcode(MI.getDesc().Opcode))
- continue;
- Location Loc = diToLocation(DI);
- unsigned &Disc = MemOpDiscriminators[Loc];
- Disc = std::max(Disc, DI->getBaseDiscriminator());
- }
- }
-
- // Keep track of the discriminators seen at each Location. If an instruction's
- // DebugInfo has a Location and discriminator we've already seen, replace its
- // discriminator with a new one, to guarantee uniqueness.
- DenseMap<Location, DenseSet<unsigned>> Seen;
-
- bool Changed = false;
- for (auto &MBB : MF) {
- for (auto &MI : MBB) {
- if (X86II::getMemoryOperandNo(MI.getDesc().TSFlags) < 0)
- continue;
- if (BypassPrefetchInstructions && IsPrefetchOpcode(MI.getDesc().Opcode))
- continue;
- const DILocation *DI = MI.getDebugLoc();
- bool HasDebug = DI;
- if (!HasDebug) {
- DI = ReferenceDI;
- }
- Location L = diToLocation(DI);
- DenseSet<unsigned> &Set = Seen[L];
- const std::pair<DenseSet<unsigned>::iterator, bool> TryInsert =
- Set.insert(DI->getBaseDiscriminator());
- if (!TryInsert.second || !HasDebug) {
- unsigned BF, DF, CI = 0;
- DILocation::decodeDiscriminator(DI->getDiscriminator(), BF, DF, CI);
- std::optional<unsigned> EncodedDiscriminator =
- DILocation::encodeDiscriminator(MemOpDiscriminators[L] + 1, DF, CI);
-
- if (!EncodedDiscriminator) {
- // FIXME(mtrofin): The assumption is that this scenario is infrequent/OK
- // not to support. If evidence points otherwise, we can explore synthesizeing
- // unique DIs by adding fake line numbers, or by constructing 64 bit
- // discriminators.
- LLVM_DEBUG(dbgs() << "Unable to create a unique discriminator "
- "for instruction with memory operand in: "
- << DI->getFilename() << " Line: " << DI->getLine()
- << " Column: " << DI->getColumn()
- << ". This is likely due to a large macro expansion. \n");
- continue;
- }
- // Since we were able to encode, bump the MemOpDiscriminators.
- ++MemOpDiscriminators[L];
- DI = DI->cloneWithDiscriminator(*EncodedDiscriminator);
- assert(DI && "DI should not be nullptr");
- updateDebugInfo(&MI, DI);
- Changed = true;
- std::pair<DenseSet<unsigned>::iterator, bool> MustInsert =
- Set.insert(DI->getBaseDiscriminator());
- (void)MustInsert; // Silence warning in release build.
- assert(MustInsert.second && "New discriminator shouldn't be present in set");
- }
-
- // Bump the reference DI to avoid cramming discriminators on line 0.
- // FIXME(mtrofin): pin ReferenceDI on blocks or first instruction with DI
- // in a block. It's more consistent than just relying on the last memop
- // instruction we happened to see.
- ReferenceDI = DI;
- }
- }
- return Changed;
-}
-
-FunctionPass *llvm::createX86DiscriminateMemOpsPass() {
- return new X86DiscriminateMemOps();
-}
diff --git a/llvm/lib/Target/X86/X86DomainReassignment.cpp b/llvm/lib/Target/X86/X86DomainReassignment.cpp
index 5d19011..2047a53 100644
--- a/llvm/lib/Target/X86/X86DomainReassignment.cpp
+++ b/llvm/lib/Target/X86/X86DomainReassignment.cpp
@@ -174,8 +174,8 @@ public:
MachineBasicBlock *MBB = MI->getParent();
const DebugLoc &DL = MI->getDebugLoc();
- Register Reg = MRI->createVirtualRegister(
- TII->getRegClass(TII->get(DstOpcode), 0, MRI->getTargetRegisterInfo()));
+ Register Reg =
+ MRI->createVirtualRegister(TII->getRegClass(TII->get(DstOpcode), 0));
MachineInstrBuilder Bld = BuildMI(*MBB, MI, DL, TII->get(DstOpcode), Reg);
for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
Bld.add(MO);
diff --git a/llvm/lib/Target/X86/X86DynAllocaExpander.cpp b/llvm/lib/Target/X86/X86DynAllocaExpander.cpp
index c2a06ef..10f46f7 100644
--- a/llvm/lib/Target/X86/X86DynAllocaExpander.cpp
+++ b/llvm/lib/Target/X86/X86DynAllocaExpander.cpp
@@ -20,22 +20,22 @@
#include "X86Subtarget.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/MachineFunctionAnalysisManager.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/IR/Analysis.h"
#include "llvm/IR/Function.h"
using namespace llvm;
namespace {
-class X86DynAllocaExpander : public MachineFunctionPass {
+class X86DynAllocaExpander {
public:
- X86DynAllocaExpander() : MachineFunctionPass(ID) {}
-
- bool runOnMachineFunction(MachineFunction &MF) override;
+ bool run(MachineFunction &MF);
private:
/// Strategies for lowering a DynAlloca.
@@ -61,22 +61,30 @@ private:
unsigned SlotSize = 0;
int64_t StackProbeSize = 0;
bool NoStackArgProbe = false;
+};
+
+class X86DynAllocaExpanderLegacy : public MachineFunctionPass {
+public:
+ X86DynAllocaExpanderLegacy() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+private:
StringRef getPassName() const override { return "X86 DynAlloca Expander"; }
public:
static char ID;
};
-char X86DynAllocaExpander::ID = 0;
+char X86DynAllocaExpanderLegacy::ID = 0;
} // end anonymous namespace
-INITIALIZE_PASS(X86DynAllocaExpander, "x86-dyn-alloca-expander",
+INITIALIZE_PASS(X86DynAllocaExpanderLegacy, "x86-dyn-alloca-expander",
"X86 DynAlloca Expander", false, false)
-FunctionPass *llvm::createX86DynAllocaExpander() {
- return new X86DynAllocaExpander();
+FunctionPass *llvm::createX86DynAllocaExpanderLegacyPass() {
+ return new X86DynAllocaExpanderLegacy();
}
/// Return the allocation amount for a DynAlloca instruction, or -1 if unknown.
@@ -277,7 +285,7 @@ void X86DynAllocaExpander::lower(MachineInstr *MI, Lowering L) {
AmountDef->eraseFromParent();
}
-bool X86DynAllocaExpander::runOnMachineFunction(MachineFunction &MF) {
+bool X86DynAllocaExpander::run(MachineFunction &MF) {
if (!MF.getInfo<X86MachineFunctionInfo>()->hasDynAlloca())
return false;
@@ -299,3 +307,19 @@ bool X86DynAllocaExpander::runOnMachineFunction(MachineFunction &MF) {
return true;
}
+
+bool X86DynAllocaExpanderLegacy::runOnMachineFunction(MachineFunction &MF) {
+ return X86DynAllocaExpander().run(MF);
+}
+
+PreservedAnalyses
+X86DynAllocaExpanderPass::run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM) {
+ bool Changed = X86DynAllocaExpander().run(MF);
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA = PreservedAnalyses::none();
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
index e3c44c0..6a18086 100644
--- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -608,40 +608,40 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDT1);
break;
case X86::PTCVTROWD2PSrreV:
- Opc = X86::TCVTROWD2PSrre;
+ Opc = X86::TCVTROWD2PSrte;
break;
case X86::PTCVTROWD2PSrriV:
- Opc = X86::TCVTROWD2PSrri;
+ Opc = X86::TCVTROWD2PSrti;
break;
case X86::PTCVTROWPS2BF16HrreV:
- Opc = X86::TCVTROWPS2BF16Hrre;
+ Opc = X86::TCVTROWPS2BF16Hrte;
break;
case X86::PTCVTROWPS2BF16HrriV:
- Opc = X86::TCVTROWPS2BF16Hrri;
+ Opc = X86::TCVTROWPS2BF16Hrti;
break;
case X86::PTCVTROWPS2BF16LrreV:
- Opc = X86::TCVTROWPS2BF16Lrre;
+ Opc = X86::TCVTROWPS2BF16Lrte;
break;
case X86::PTCVTROWPS2BF16LrriV:
- Opc = X86::TCVTROWPS2BF16Lrri;
+ Opc = X86::TCVTROWPS2BF16Lrti;
break;
case X86::PTCVTROWPS2PHHrreV:
- Opc = X86::TCVTROWPS2PHHrre;
+ Opc = X86::TCVTROWPS2PHHrte;
break;
case X86::PTCVTROWPS2PHHrriV:
- Opc = X86::TCVTROWPS2PHHrri;
+ Opc = X86::TCVTROWPS2PHHrti;
break;
case X86::PTCVTROWPS2PHLrreV:
- Opc = X86::TCVTROWPS2PHLrre;
+ Opc = X86::TCVTROWPS2PHLrte;
break;
case X86::PTCVTROWPS2PHLrriV:
- Opc = X86::TCVTROWPS2PHLrri;
+ Opc = X86::TCVTROWPS2PHLrti;
break;
case X86::PTILEMOVROWrreV:
- Opc = X86::TILEMOVROWrre;
+ Opc = X86::TILEMOVROWrte;
break;
case X86::PTILEMOVROWrriV:
- Opc = X86::TILEMOVROWrri;
+ Opc = X86::TILEMOVROWrti;
break;
default:
llvm_unreachable("Unexpected Opcode");
diff --git a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
index 06f729a..25799f4 100644
--- a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
+++ b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
@@ -206,8 +206,7 @@ void X86FastPreTileConfig::spill(MachineBasicBlock::iterator Before,
const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
// Don't need shape information for tile store, becasue it is adjacent to
// the tile def instruction.
- TII->storeRegToStackSlot(*MBB, Before, VirtReg, Kill, FI, &RC, TRI,
- Register());
+ TII->storeRegToStackSlot(*MBB, Before, VirtReg, Kill, FI, &RC, Register());
++NumStores;
// TODO: update DBG_VALUEs
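
Note on the spill call above: storeRegToStackSlot likewise loses its TargetRegisterInfo parameter; the trailing Register() argument is carried over unchanged from the old call. A minimal sketch with a hypothetical wrapper mirroring that call:

  #include "llvm/CodeGen/MachineBasicBlock.h"
  #include "llvm/CodeGen/TargetInstrInfo.h"
  using namespace llvm;

  // Illustrative helper only, mirroring the updated call in the hunk above.
  static void spillToSlot(const TargetInstrInfo *TII, MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator Before, Register VirtReg,
                          bool IsKill, int FrameIdx,
                          const TargetRegisterClass &RC) {
    TII->storeRegToStackSlot(MBB, Before, VirtReg, IsKill, FrameIdx, &RC,
                             Register());
  }
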
diff --git a/llvm/lib/Target/X86/X86FixupBWInsts.cpp b/llvm/lib/Target/X86/X86FixupBWInsts.cpp
index 6274cb4..6e0a0f6 100644
--- a/llvm/lib/Target/X86/X86FixupBWInsts.cpp
+++ b/llvm/lib/Target/X86/X86FixupBWInsts.cpp
@@ -202,7 +202,8 @@ Register FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI) const {
MCRegUnitIterator I = Range.begin(), E = Range.end();
for (MCRegUnit S : TRI->regunits(SuperDestReg)) {
I = std::lower_bound(I, E, S);
- if ((I == E || *I > S) && LiveUnits.getBitVector().test(S)) {
+ if ((I == E || *I > S) &&
+ LiveUnits.getBitVector().test(static_cast<unsigned>(S))) {
SuperIsLive = true;
break;
}
diff --git a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
index ab6e6d0..b3bf37a 100644
--- a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
+++ b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
@@ -50,7 +50,6 @@
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
#include <cassert>
#include <iterator>
#include <utility>
diff --git a/llvm/lib/Target/X86/X86FloatingPoint.cpp b/llvm/lib/Target/X86/X86FloatingPoint.cpp
index 9f88fda..6af2050 100644
--- a/llvm/lib/Target/X86/X86FloatingPoint.cpp
+++ b/llvm/lib/Target/X86/X86FloatingPoint.cpp
@@ -31,6 +31,8 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/EdgeBundles.h"
#include "llvm/CodeGen/LiveRegUnits.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionAnalysisManager.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -38,6 +40,7 @@
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/Config/llvm-config.h"
+#include "llvm/IR/Analysis.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/Debug.h"
@@ -48,265 +51,272 @@
#include <bitset>
using namespace llvm;
-#define DEBUG_TYPE "x86-codegen"
+#define DEBUG_TYPE "x86-fp-stackifier"
STATISTIC(NumFXCH, "Number of fxch instructions inserted");
-STATISTIC(NumFP , "Number of floating point instructions");
+STATISTIC(NumFP, "Number of floating point instructions");
namespace {
- const unsigned ScratchFPReg = 7;
-
- struct FPS : public MachineFunctionPass {
- static char ID;
- FPS() : MachineFunctionPass(ID) {
- // This is really only to keep valgrind quiet.
- // The logic in isLive() is too much for it.
- memset(Stack, 0, sizeof(Stack));
- memset(RegMap, 0, sizeof(RegMap));
- }
+const unsigned ScratchFPReg = 7;
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<EdgeBundlesWrapperLegacy>();
- AU.addPreservedID(MachineLoopInfoID);
- AU.addPreservedID(MachineDominatorsID);
- MachineFunctionPass::getAnalysisUsage(AU);
- }
+class FPS {
+public:
+ bool shouldRun(MachineFunction &MF);
+ bool run(MachineFunction &MF, EdgeBundles *EdgeBundles);
- bool runOnMachineFunction(MachineFunction &MF) override;
+private:
+ const TargetInstrInfo *TII = nullptr; // Machine instruction info.
- MachineFunctionProperties getRequiredProperties() const override {
- return MachineFunctionProperties().setNoVRegs();
- }
+ // Two CFG edges are related if they leave the same block, or enter the same
+ // block. The transitive closure of an edge under this relation is a
+ // LiveBundle. It represents a set of CFG edges where the live FP stack
+ // registers must be allocated identically in the x87 stack.
+ //
+ // A LiveBundle is usually all the edges leaving a block, or all the edges
+ // entering a block, but it can contain more edges if critical edges are
+ // present.
+ //
+ // The set of live FP registers in a LiveBundle is calculated by bundleCFG,
+ // but the exact mapping of FP registers to stack slots is fixed later.
+ struct LiveBundle {
+ // Bit mask of live FP registers. Bit 0 = FP0, bit 1 = FP1, &c.
+ unsigned Mask = 0;
- StringRef getPassName() const override { return "X86 FP Stackifier"; }
+ // Number of pre-assigned live registers in FixStack. This is 0 when the
+ // stack order has not yet been fixed.
+ unsigned FixCount = 0;
- private:
- const TargetInstrInfo *TII = nullptr; // Machine instruction info.
+ // Assigned stack order for live-in registers.
+ // FixStack[i] == getStackEntry(i) for all i < FixCount.
+ unsigned char FixStack[8];
- // Two CFG edges are related if they leave the same block, or enter the same
- // block. The transitive closure of an edge under this relation is a
- // LiveBundle. It represents a set of CFG edges where the live FP stack
- // registers must be allocated identically in the x87 stack.
- //
- // A LiveBundle is usually all the edges leaving a block, or all the edges
- // entering a block, but it can contain more edges if critical edges are
- // present.
- //
- // The set of live FP registers in a LiveBundle is calculated by bundleCFG,
- // but the exact mapping of FP registers to stack slots is fixed later.
- struct LiveBundle {
- // Bit mask of live FP registers. Bit 0 = FP0, bit 1 = FP1, &c.
- unsigned Mask = 0;
-
- // Number of pre-assigned live registers in FixStack. This is 0 when the
- // stack order has not yet been fixed.
- unsigned FixCount = 0;
-
- // Assigned stack order for live-in registers.
- // FixStack[i] == getStackEntry(i) for all i < FixCount.
- unsigned char FixStack[8];
-
- LiveBundle() = default;
-
- // Have the live registers been assigned a stack order yet?
- bool isFixed() const { return !Mask || FixCount; }
- };
-
- // Numbered LiveBundle structs. LiveBundles[0] is used for all CFG edges
- // with no live FP registers.
- SmallVector<LiveBundle, 8> LiveBundles;
-
- // The edge bundle analysis provides indices into the LiveBundles vector.
- EdgeBundles *Bundles = nullptr;
-
- // Return a bitmask of FP registers in block's live-in list.
- static unsigned calcLiveInMask(MachineBasicBlock *MBB, bool RemoveFPs) {
- unsigned Mask = 0;
- for (MachineBasicBlock::livein_iterator I = MBB->livein_begin();
- I != MBB->livein_end(); ) {
- MCPhysReg Reg = I->PhysReg;
- static_assert(X86::FP6 - X86::FP0 == 6, "sequential regnums");
- if (Reg >= X86::FP0 && Reg <= X86::FP6) {
- Mask |= 1 << (Reg - X86::FP0);
- if (RemoveFPs) {
- I = MBB->removeLiveIn(I);
- continue;
- }
+ LiveBundle() = default;
+
+ // Have the live registers been assigned a stack order yet?
+ bool isFixed() const { return !Mask || FixCount; }
+ };
+
+ // Numbered LiveBundle structs. LiveBundles[0] is used for all CFG edges
+ // with no live FP registers.
+ SmallVector<LiveBundle, 8> LiveBundles;
+
+ // The edge bundle analysis provides indices into the LiveBundles vector.
+ EdgeBundles *Bundles = nullptr;
+
+ // Return a bitmask of FP registers in block's live-in list.
+ static unsigned calcLiveInMask(MachineBasicBlock *MBB, bool RemoveFPs) {
+ unsigned Mask = 0;
+ for (MachineBasicBlock::livein_iterator I = MBB->livein_begin();
+ I != MBB->livein_end();) {
+ MCPhysReg Reg = I->PhysReg;
+ static_assert(X86::FP6 - X86::FP0 == 6, "sequential regnums");
+ if (Reg >= X86::FP0 && Reg <= X86::FP6) {
+ Mask |= 1 << (Reg - X86::FP0);
+ if (RemoveFPs) {
+ I = MBB->removeLiveIn(I);
+ continue;
}
- ++I;
}
- return Mask;
+ ++I;
}
+ return Mask;
+ }
- // Partition all the CFG edges into LiveBundles.
- void bundleCFGRecomputeKillFlags(MachineFunction &MF);
+ // Partition all the CFG edges into LiveBundles.
+ void bundleCFGRecomputeKillFlags(MachineFunction &MF);
- MachineBasicBlock *MBB = nullptr; // Current basic block
+ MachineBasicBlock *MBB = nullptr; // Current basic block
- // The hardware keeps track of how many FP registers are live, so we have
- // to model that exactly. Usually, each live register corresponds to an
- // FP<n> register, but when dealing with calls, returns, and inline
- // assembly, it is sometimes necessary to have live scratch registers.
- unsigned Stack[8]; // FP<n> Registers in each stack slot...
- unsigned StackTop = 0; // The current top of the FP stack.
+ // The hardware keeps track of how many FP registers are live, so we have
+ // to model that exactly. Usually, each live register corresponds to an
+ // FP<n> register, but when dealing with calls, returns, and inline
+ // assembly, it is sometimes necessary to have live scratch registers.
+ unsigned Stack[8] = {}; // FP<n> Registers in each stack slot...
+ unsigned StackTop = 0; // The current top of the FP stack.
- enum {
- NumFPRegs = 8 // Including scratch pseudo-registers.
- };
+ enum {
+ NumFPRegs = 8 // Including scratch pseudo-registers.
+ };
- // For each live FP<n> register, point to its Stack[] entry.
- // The first entries correspond to FP0-FP6, the rest are scratch registers
- // used when we need slightly different live registers than what the
- // register allocator thinks.
- unsigned RegMap[NumFPRegs];
+ // For each live FP<n> register, point to its Stack[] entry.
+ // The first entries correspond to FP0-FP6, the rest are scratch registers
+ // used when we need slightly different live registers than what the
+ // register allocator thinks.
+ unsigned RegMap[NumFPRegs] = {};
- // Set up our stack model to match the incoming registers to MBB.
- void setupBlockStack();
+ // Set up our stack model to match the incoming registers to MBB.
+ void setupBlockStack();
- // Shuffle live registers to match the expectations of successor blocks.
- void finishBlockStack();
+ // Shuffle live registers to match the expectations of successor blocks.
+ void finishBlockStack();
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- void dumpStack() const {
- dbgs() << "Stack contents:";
- for (unsigned i = 0; i != StackTop; ++i) {
- dbgs() << " FP" << Stack[i];
- assert(RegMap[Stack[i]] == i && "Stack[] doesn't match RegMap[]!");
- }
+ void dumpStack() const {
+ dbgs() << "Stack contents:";
+ for (unsigned i = 0; i != StackTop; ++i) {
+ dbgs() << " FP" << Stack[i];
+ assert(RegMap[Stack[i]] == i && "Stack[] doesn't match RegMap[]!");
}
+ }
#endif
- /// getSlot - Return the stack slot number a particular register number is
- /// in.
- unsigned getSlot(unsigned RegNo) const {
- assert(RegNo < NumFPRegs && "Regno out of range!");
- return RegMap[RegNo];
- }
+ /// getSlot - Return the stack slot number a particular register number is
+ /// in.
+ unsigned getSlot(unsigned RegNo) const {
+ assert(RegNo < NumFPRegs && "Regno out of range!");
+ return RegMap[RegNo];
+ }
- /// isLive - Is RegNo currently live in the stack?
- bool isLive(unsigned RegNo) const {
- unsigned Slot = getSlot(RegNo);
- return Slot < StackTop && Stack[Slot] == RegNo;
- }
+ /// isLive - Is RegNo currently live in the stack?
+ bool isLive(unsigned RegNo) const {
+ unsigned Slot = getSlot(RegNo);
+ return Slot < StackTop && Stack[Slot] == RegNo;
+ }
- /// getStackEntry - Return the X86::FP<n> register in register ST(i).
- unsigned getStackEntry(unsigned STi) const {
- if (STi >= StackTop)
- report_fatal_error("Access past stack top!");
- return Stack[StackTop-1-STi];
- }
+ /// getStackEntry - Return the X86::FP<n> register in register ST(i).
+ unsigned getStackEntry(unsigned STi) const {
+ if (STi >= StackTop)
+ report_fatal_error("Access past stack top!");
+ return Stack[StackTop - 1 - STi];
+ }
- /// getSTReg - Return the X86::ST(i) register which contains the specified
- /// FP<RegNo> register.
- unsigned getSTReg(unsigned RegNo) const {
- return StackTop - 1 - getSlot(RegNo) + X86::ST0;
- }
+ /// getSTReg - Return the X86::ST(i) register which contains the specified
+ /// FP<RegNo> register.
+ unsigned getSTReg(unsigned RegNo) const {
+ return StackTop - 1 - getSlot(RegNo) + X86::ST0;
+ }
- // pushReg - Push the specified FP<n> register onto the stack.
- void pushReg(unsigned Reg) {
- assert(Reg < NumFPRegs && "Register number out of range!");
- if (StackTop >= 8)
- report_fatal_error("Stack overflow!");
- Stack[StackTop] = Reg;
- RegMap[Reg] = StackTop++;
- }
+ // pushReg - Push the specified FP<n> register onto the stack.
+ void pushReg(unsigned Reg) {
+ assert(Reg < NumFPRegs && "Register number out of range!");
+ if (StackTop >= 8)
+ report_fatal_error("Stack overflow!");
+ Stack[StackTop] = Reg;
+ RegMap[Reg] = StackTop++;
+ }
- // popReg - Pop a register from the stack.
- void popReg() {
- if (StackTop == 0)
- report_fatal_error("Cannot pop empty stack!");
- RegMap[Stack[--StackTop]] = ~0; // Update state
- }
+ // popReg - Pop a register from the stack.
+ void popReg() {
+ if (StackTop == 0)
+ report_fatal_error("Cannot pop empty stack!");
+ RegMap[Stack[--StackTop]] = ~0; // Update state
+ }
- bool isAtTop(unsigned RegNo) const { return getSlot(RegNo) == StackTop-1; }
- void moveToTop(unsigned RegNo, MachineBasicBlock::iterator I) {
- DebugLoc dl = I == MBB->end() ? DebugLoc() : I->getDebugLoc();
- if (isAtTop(RegNo)) return;
+ bool isAtTop(unsigned RegNo) const { return getSlot(RegNo) == StackTop - 1; }
+ void moveToTop(unsigned RegNo, MachineBasicBlock::iterator I) {
+ DebugLoc dl = I == MBB->end() ? DebugLoc() : I->getDebugLoc();
+ if (isAtTop(RegNo))
+ return;
- unsigned STReg = getSTReg(RegNo);
- unsigned RegOnTop = getStackEntry(0);
+ unsigned STReg = getSTReg(RegNo);
+ unsigned RegOnTop = getStackEntry(0);
- // Swap the slots the regs are in.
- std::swap(RegMap[RegNo], RegMap[RegOnTop]);
+ // Swap the slots the regs are in.
+ std::swap(RegMap[RegNo], RegMap[RegOnTop]);
- // Swap stack slot contents.
- if (RegMap[RegOnTop] >= StackTop)
- report_fatal_error("Access past stack top!");
- std::swap(Stack[RegMap[RegOnTop]], Stack[StackTop-1]);
+ // Swap stack slot contents.
+ if (RegMap[RegOnTop] >= StackTop)
+ report_fatal_error("Access past stack top!");
+ std::swap(Stack[RegMap[RegOnTop]], Stack[StackTop - 1]);
- // Emit an fxch to update the runtime processors version of the state.
- BuildMI(*MBB, I, dl, TII->get(X86::XCH_F)).addReg(STReg);
- ++NumFXCH;
- }
+    // Emit an fxch to update the runtime processor's version of the state.
+ BuildMI(*MBB, I, dl, TII->get(X86::XCH_F)).addReg(STReg);
+ ++NumFXCH;
+ }
- void duplicateToTop(unsigned RegNo, unsigned AsReg,
- MachineBasicBlock::iterator I) {
- DebugLoc dl = I == MBB->end() ? DebugLoc() : I->getDebugLoc();
- unsigned STReg = getSTReg(RegNo);
- pushReg(AsReg); // New register on top of stack
+ void duplicateToTop(unsigned RegNo, unsigned AsReg,
+ MachineBasicBlock::iterator I) {
+ DebugLoc dl = I == MBB->end() ? DebugLoc() : I->getDebugLoc();
+ unsigned STReg = getSTReg(RegNo);
+ pushReg(AsReg); // New register on top of stack
- BuildMI(*MBB, I, dl, TII->get(X86::LD_Frr)).addReg(STReg);
- }
+ BuildMI(*MBB, I, dl, TII->get(X86::LD_Frr)).addReg(STReg);
+ }
- /// popStackAfter - Pop the current value off of the top of the FP stack
- /// after the specified instruction.
- void popStackAfter(MachineBasicBlock::iterator &I);
-
- /// freeStackSlotAfter - Free the specified register from the register
- /// stack, so that it is no longer in a register. If the register is
- /// currently at the top of the stack, we just pop the current instruction,
- /// otherwise we store the current top-of-stack into the specified slot,
- /// then pop the top of stack.
- void freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned Reg);
-
- /// freeStackSlotBefore - Just the pop, no folding. Return the inserted
- /// instruction.
- MachineBasicBlock::iterator
- freeStackSlotBefore(MachineBasicBlock::iterator I, unsigned FPRegNo);
-
- /// Adjust the live registers to be the set in Mask.
- void adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I);
-
- /// Shuffle the top FixCount stack entries such that FP reg FixStack[0] is
- /// st(0), FP reg FixStack[1] is st(1) etc.
- void shuffleStackTop(const unsigned char *FixStack, unsigned FixCount,
- MachineBasicBlock::iterator I);
-
- bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB);
-
- void handleCall(MachineBasicBlock::iterator &I);
- void handleReturn(MachineBasicBlock::iterator &I);
- void handleZeroArgFP(MachineBasicBlock::iterator &I);
- void handleOneArgFP(MachineBasicBlock::iterator &I);
- void handleOneArgFPRW(MachineBasicBlock::iterator &I);
- void handleTwoArgFP(MachineBasicBlock::iterator &I);
- void handleCompareFP(MachineBasicBlock::iterator &I);
- void handleCondMovFP(MachineBasicBlock::iterator &I);
- void handleSpecialFP(MachineBasicBlock::iterator &I);
-
- // Check if a COPY instruction is using FP registers.
- static bool isFPCopy(MachineInstr &MI) {
- Register DstReg = MI.getOperand(0).getReg();
- Register SrcReg = MI.getOperand(1).getReg();
-
- return X86::RFP80RegClass.contains(DstReg) ||
- X86::RFP80RegClass.contains(SrcReg);
- }
+ /// popStackAfter - Pop the current value off of the top of the FP stack
+ /// after the specified instruction.
+ void popStackAfter(MachineBasicBlock::iterator &I);
+
+ /// freeStackSlotAfter - Free the specified register from the register
+ /// stack, so that it is no longer in a register. If the register is
+ /// currently at the top of the stack, we just pop the current instruction,
+ /// otherwise we store the current top-of-stack into the specified slot,
+ /// then pop the top of stack.
+ void freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned Reg);
+
+ /// freeStackSlotBefore - Just the pop, no folding. Return the inserted
+ /// instruction.
+ MachineBasicBlock::iterator freeStackSlotBefore(MachineBasicBlock::iterator I,
+ unsigned FPRegNo);
+
+ /// Adjust the live registers to be the set in Mask.
+ void adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I);
+
+ /// Shuffle the top FixCount stack entries such that FP reg FixStack[0] is
+ /// st(0), FP reg FixStack[1] is st(1) etc.
+ void shuffleStackTop(const unsigned char *FixStack, unsigned FixCount,
+ MachineBasicBlock::iterator I);
+
+ bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB);
+
+ void handleCall(MachineBasicBlock::iterator &I);
+ void handleReturn(MachineBasicBlock::iterator &I);
+ void handleZeroArgFP(MachineBasicBlock::iterator &I);
+ void handleOneArgFP(MachineBasicBlock::iterator &I);
+ void handleOneArgFPRW(MachineBasicBlock::iterator &I);
+ void handleTwoArgFP(MachineBasicBlock::iterator &I);
+ void handleCompareFP(MachineBasicBlock::iterator &I);
+ void handleCondMovFP(MachineBasicBlock::iterator &I);
+ void handleSpecialFP(MachineBasicBlock::iterator &I);
+
+ // Check if a COPY instruction is using FP registers.
+ static bool isFPCopy(MachineInstr &MI) {
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+
+ return X86::RFP80RegClass.contains(DstReg) ||
+ X86::RFP80RegClass.contains(SrcReg);
+ }
- void setKillFlags(MachineBasicBlock &MBB) const;
- };
-}
+ void setKillFlags(MachineBasicBlock &MBB) const;
+};
-char FPS::ID = 0;
+class X86FPStackifierLegacy : public MachineFunctionPass {
+public:
+ X86FPStackifierLegacy() : MachineFunctionPass(ID) {}
-INITIALIZE_PASS_BEGIN(FPS, DEBUG_TYPE, "X86 FP Stackifier",
+ static char ID;
+
+private:
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<EdgeBundlesWrapperLegacy>();
+ AU.addPreservedID(MachineLoopInfoID);
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().setNoVRegs();
+ }
+
+ StringRef getPassName() const override { return "X86 FP Stackifier"; }
+};
+} // namespace
+
+char X86FPStackifierLegacy::ID = 0;
+
+INITIALIZE_PASS_BEGIN(X86FPStackifierLegacy, DEBUG_TYPE, "X86 FP Stackifier",
false, false)
INITIALIZE_PASS_DEPENDENCY(EdgeBundlesWrapperLegacy)
-INITIALIZE_PASS_END(FPS, DEBUG_TYPE, "X86 FP Stackifier",
+INITIALIZE_PASS_END(X86FPStackifierLegacy, DEBUG_TYPE, "X86 FP Stackifier",
false, false)
-FunctionPass *llvm::createX86FloatingPointStackifierPass() { return new FPS(); }
+FunctionPass *llvm::createX86FPStackifierLegacyPass() {
+ return new X86FPStackifierLegacy();
+}
/// getFPReg - Return the X86::FPx register number for the specified operand.
/// For example, this returns 3 for X86::FP3.
@@ -317,26 +327,25 @@ static unsigned getFPReg(const MachineOperand &MO) {
return Reg - X86::FP0;
}
-/// runOnMachineFunction - Loop over all of the basic blocks, transforming FP
-/// register references into FP stack references.
-///
-bool FPS::runOnMachineFunction(MachineFunction &MF) {
+bool FPS::shouldRun(MachineFunction &MF) {
// We only need to run this pass if there are any FP registers used in this
// function. If it is all integer, there is nothing for us to do!
- bool FPIsUsed = false;
-
- static_assert(X86::FP6 == X86::FP0+6, "Register enums aren't sorted right!");
+ static_assert(X86::FP6 == X86::FP0 + 6,
+ "Register enums aren't sorted right!");
const MachineRegisterInfo &MRI = MF.getRegInfo();
- for (unsigned i = 0; i <= 6; ++i)
- if (!MRI.reg_nodbg_empty(X86::FP0 + i)) {
- FPIsUsed = true;
- break;
+ for (unsigned I = 0; I <= 6; ++I)
+ if (!MRI.reg_nodbg_empty(X86::FP0 + I)) {
+ return true;
}
- // Early exit.
- if (!FPIsUsed) return false;
+ return false;
+}
- Bundles = &getAnalysis<EdgeBundlesWrapperLegacy>().getEdgeBundles();
+/// runOnMachineFunction - Loop over all of the basic blocks, transforming FP
+/// register references into FP stack references.
+///
+bool FPS::run(MachineFunction &MF, EdgeBundles *FunctionBundles) {
+ Bundles = FunctionBundles;
TII = MF.getSubtarget().getInstrInfo();
// Prepare cross-MBB liveness.
@@ -346,16 +355,17 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) {
// Process the function in depth first order so that we process at least one
// of the predecessors for every reachable block in the function.
- df_iterator_default_set<MachineBasicBlock*> Processed;
+ df_iterator_default_set<MachineBasicBlock *> Processed;
MachineBasicBlock *Entry = &MF.front();
LiveBundle &Bundle =
- LiveBundles[Bundles->getBundle(Entry->getNumber(), false)];
+ LiveBundles[Bundles->getBundle(Entry->getNumber(), false)];
// In regcall convention, some FP registers may not be passed through
// the stack, so they will need to be assigned to the stack first
if ((Entry->getParent()->getFunction().getCallingConv() ==
- CallingConv::X86_RegCall) && (Bundle.Mask && !Bundle.FixCount)) {
+ CallingConv::X86_RegCall) &&
+ (Bundle.Mask && !Bundle.FixCount)) {
// In the register calling convention, up to one FP argument could be
// saved in the first FP register.
// If bundle.mask is non-zero and Bundle.FixCount is zero, it means
@@ -363,7 +373,7 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) {
// The actual value is passed in FP0.
// Here we fix the stack and mark FP0 as pre-assigned register.
assert((Bundle.Mask & 0xFE) == 0 &&
- "Only FP0 could be passed as an argument");
+ "Only FP0 could be passed as an argument");
Bundle.FixCount = 1;
Bundle.FixStack[0] = 0;
}
@@ -450,13 +460,13 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
}
if (FPInstClass == X86II::NotFP)
- continue; // Efficiently ignore non-fp insts!
+ continue; // Efficiently ignore non-fp insts!
MachineInstr *PrevMI = nullptr;
if (I != BB.begin())
PrevMI = &*std::prev(I);
- ++NumFP; // Keep track of # of pseudo instrs
+ ++NumFP; // Keep track of # of pseudo instrs
LLVM_DEBUG(dbgs() << "\nFPInst:\t" << MI);
// Get dead variables list now because the MI pointer may be deleted as part
@@ -467,14 +477,29 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
DeadRegs.push_back(MO.getReg());
switch (FPInstClass) {
- case X86II::ZeroArgFP: handleZeroArgFP(I); break;
- case X86II::OneArgFP: handleOneArgFP(I); break; // fstp ST(0)
- case X86II::OneArgFPRW: handleOneArgFPRW(I); break; // ST(0) = fsqrt(ST(0))
- case X86II::TwoArgFP: handleTwoArgFP(I); break;
- case X86II::CompareFP: handleCompareFP(I); break;
- case X86II::CondMovFP: handleCondMovFP(I); break;
- case X86II::SpecialFP: handleSpecialFP(I); break;
- default: llvm_unreachable("Unknown FP Type!");
+ case X86II::ZeroArgFP:
+ handleZeroArgFP(I);
+ break;
+ case X86II::OneArgFP:
+ handleOneArgFP(I);
+ break; // fstp ST(0)
+ case X86II::OneArgFPRW:
+ handleOneArgFPRW(I);
+ break; // ST(0) = fsqrt(ST(0))
+ case X86II::TwoArgFP:
+ handleTwoArgFP(I);
+ break;
+ case X86II::CompareFP:
+ handleCompareFP(I);
+ break;
+ case X86II::CondMovFP:
+ handleCondMovFP(I);
+ break;
+ case X86II::SpecialFP:
+ handleSpecialFP(I);
+ break;
+ default:
+ llvm_unreachable("Unknown FP Type!");
}
// Check to see if any of the values defined by this instruction are dead
@@ -483,9 +508,9 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
// Check if Reg is live on the stack. An inline-asm register operand that
// is in the clobber list and marked dead might not be live on the stack.
static_assert(X86::FP7 - X86::FP0 == 7, "sequential FP regnumbers");
- if (Reg >= X86::FP0 && Reg <= X86::FP6 && isLive(Reg-X86::FP0)) {
+ if (Reg >= X86::FP0 && Reg <= X86::FP6 && isLive(Reg - X86::FP0)) {
LLVM_DEBUG(dbgs() << "Register FP#" << Reg - X86::FP0 << " is dead!\n");
- freeStackSlotAfter(I, Reg-X86::FP0);
+ freeStackSlotAfter(I, Reg - X86::FP0);
}
}
@@ -524,7 +549,7 @@ void FPS::setupBlockStack() {
StackTop = 0;
// Get the live-in bundle for MBB.
const LiveBundle &Bundle =
- LiveBundles[Bundles->getBundle(MBB->getNumber(), false)];
+ LiveBundles[Bundles->getBundle(MBB->getNumber(), false)];
if (!Bundle.Mask) {
LLVM_DEBUG(dbgs() << "Block has no FP live-ins.\n");
@@ -538,7 +563,7 @@ void FPS::setupBlockStack() {
for (unsigned i = Bundle.FixCount; i > 0; --i) {
LLVM_DEBUG(dbgs() << "Live-in st(" << (i - 1) << "): %fp"
<< unsigned(Bundle.FixStack[i - 1]) << '\n');
- pushReg(Bundle.FixStack[i-1]);
+ pushReg(Bundle.FixStack[i - 1]);
}
// Kill off unwanted live-ins. This can happen with a critical edge.
@@ -589,24 +614,23 @@ void FPS::finishBlockStack() {
}
}
-
//===----------------------------------------------------------------------===//
// Efficient Lookup Table Support
//===----------------------------------------------------------------------===//
namespace {
- struct TableEntry {
- uint16_t from;
- uint16_t to;
- bool operator<(const TableEntry &TE) const { return from < TE.from; }
- friend bool operator<(const TableEntry &TE, unsigned V) {
- return TE.from < V;
- }
- [[maybe_unused]] friend bool operator<(unsigned V, const TableEntry &TE) {
- return V < TE.from;
- }
- };
-}
+struct TableEntry {
+ uint16_t from;
+ uint16_t to;
+ bool operator<(const TableEntry &TE) const { return from < TE.from; }
+ friend bool operator<(const TableEntry &TE, unsigned V) {
+ return TE.from < V;
+ }
+ [[maybe_unused]] friend bool operator<(unsigned V, const TableEntry &TE) {
+ return V < TE.from;
+ }
+};
+} // namespace
static int Lookup(ArrayRef<TableEntry> Table, unsigned Opcode) {
const TableEntry *I = llvm::lower_bound(Table, Opcode);
@@ -638,168 +662,168 @@ static int Lookup(ArrayRef<TableEntry> Table, unsigned Opcode) {
// concrete X86 instruction which uses the register stack.
//
static const TableEntry OpcodeTable[] = {
- { X86::ABS_Fp32 , X86::ABS_F },
- { X86::ABS_Fp64 , X86::ABS_F },
- { X86::ABS_Fp80 , X86::ABS_F },
- { X86::ADD_Fp32m , X86::ADD_F32m },
- { X86::ADD_Fp64m , X86::ADD_F64m },
- { X86::ADD_Fp64m32 , X86::ADD_F32m },
- { X86::ADD_Fp80m32 , X86::ADD_F32m },
- { X86::ADD_Fp80m64 , X86::ADD_F64m },
- { X86::ADD_FpI16m32 , X86::ADD_FI16m },
- { X86::ADD_FpI16m64 , X86::ADD_FI16m },
- { X86::ADD_FpI16m80 , X86::ADD_FI16m },
- { X86::ADD_FpI32m32 , X86::ADD_FI32m },
- { X86::ADD_FpI32m64 , X86::ADD_FI32m },
- { X86::ADD_FpI32m80 , X86::ADD_FI32m },
- { X86::CHS_Fp32 , X86::CHS_F },
- { X86::CHS_Fp64 , X86::CHS_F },
- { X86::CHS_Fp80 , X86::CHS_F },
- { X86::CMOVBE_Fp32 , X86::CMOVBE_F },
- { X86::CMOVBE_Fp64 , X86::CMOVBE_F },
- { X86::CMOVBE_Fp80 , X86::CMOVBE_F },
- { X86::CMOVB_Fp32 , X86::CMOVB_F },
- { X86::CMOVB_Fp64 , X86::CMOVB_F },
- { X86::CMOVB_Fp80 , X86::CMOVB_F },
- { X86::CMOVE_Fp32 , X86::CMOVE_F },
- { X86::CMOVE_Fp64 , X86::CMOVE_F },
- { X86::CMOVE_Fp80 , X86::CMOVE_F },
- { X86::CMOVNBE_Fp32 , X86::CMOVNBE_F },
- { X86::CMOVNBE_Fp64 , X86::CMOVNBE_F },
- { X86::CMOVNBE_Fp80 , X86::CMOVNBE_F },
- { X86::CMOVNB_Fp32 , X86::CMOVNB_F },
- { X86::CMOVNB_Fp64 , X86::CMOVNB_F },
- { X86::CMOVNB_Fp80 , X86::CMOVNB_F },
- { X86::CMOVNE_Fp32 , X86::CMOVNE_F },
- { X86::CMOVNE_Fp64 , X86::CMOVNE_F },
- { X86::CMOVNE_Fp80 , X86::CMOVNE_F },
- { X86::CMOVNP_Fp32 , X86::CMOVNP_F },
- { X86::CMOVNP_Fp64 , X86::CMOVNP_F },
- { X86::CMOVNP_Fp80 , X86::CMOVNP_F },
- { X86::CMOVP_Fp32 , X86::CMOVP_F },
- { X86::CMOVP_Fp64 , X86::CMOVP_F },
- { X86::CMOVP_Fp80 , X86::CMOVP_F },
- { X86::COM_FpIr32 , X86::COM_FIr },
- { X86::COM_FpIr64 , X86::COM_FIr },
- { X86::COM_FpIr80 , X86::COM_FIr },
- { X86::COM_Fpr32 , X86::COM_FST0r },
- { X86::COM_Fpr64 , X86::COM_FST0r },
- { X86::COM_Fpr80 , X86::COM_FST0r },
- { X86::DIVR_Fp32m , X86::DIVR_F32m },
- { X86::DIVR_Fp64m , X86::DIVR_F64m },
- { X86::DIVR_Fp64m32 , X86::DIVR_F32m },
- { X86::DIVR_Fp80m32 , X86::DIVR_F32m },
- { X86::DIVR_Fp80m64 , X86::DIVR_F64m },
- { X86::DIVR_FpI16m32, X86::DIVR_FI16m},
- { X86::DIVR_FpI16m64, X86::DIVR_FI16m},
- { X86::DIVR_FpI16m80, X86::DIVR_FI16m},
- { X86::DIVR_FpI32m32, X86::DIVR_FI32m},
- { X86::DIVR_FpI32m64, X86::DIVR_FI32m},
- { X86::DIVR_FpI32m80, X86::DIVR_FI32m},
- { X86::DIV_Fp32m , X86::DIV_F32m },
- { X86::DIV_Fp64m , X86::DIV_F64m },
- { X86::DIV_Fp64m32 , X86::DIV_F32m },
- { X86::DIV_Fp80m32 , X86::DIV_F32m },
- { X86::DIV_Fp80m64 , X86::DIV_F64m },
- { X86::DIV_FpI16m32 , X86::DIV_FI16m },
- { X86::DIV_FpI16m64 , X86::DIV_FI16m },
- { X86::DIV_FpI16m80 , X86::DIV_FI16m },
- { X86::DIV_FpI32m32 , X86::DIV_FI32m },
- { X86::DIV_FpI32m64 , X86::DIV_FI32m },
- { X86::DIV_FpI32m80 , X86::DIV_FI32m },
- { X86::ILD_Fp16m32 , X86::ILD_F16m },
- { X86::ILD_Fp16m64 , X86::ILD_F16m },
- { X86::ILD_Fp16m80 , X86::ILD_F16m },
- { X86::ILD_Fp32m32 , X86::ILD_F32m },
- { X86::ILD_Fp32m64 , X86::ILD_F32m },
- { X86::ILD_Fp32m80 , X86::ILD_F32m },
- { X86::ILD_Fp64m32 , X86::ILD_F64m },
- { X86::ILD_Fp64m64 , X86::ILD_F64m },
- { X86::ILD_Fp64m80 , X86::ILD_F64m },
- { X86::ISTT_Fp16m32 , X86::ISTT_FP16m},
- { X86::ISTT_Fp16m64 , X86::ISTT_FP16m},
- { X86::ISTT_Fp16m80 , X86::ISTT_FP16m},
- { X86::ISTT_Fp32m32 , X86::ISTT_FP32m},
- { X86::ISTT_Fp32m64 , X86::ISTT_FP32m},
- { X86::ISTT_Fp32m80 , X86::ISTT_FP32m},
- { X86::ISTT_Fp64m32 , X86::ISTT_FP64m},
- { X86::ISTT_Fp64m64 , X86::ISTT_FP64m},
- { X86::ISTT_Fp64m80 , X86::ISTT_FP64m},
- { X86::IST_Fp16m32 , X86::IST_F16m },
- { X86::IST_Fp16m64 , X86::IST_F16m },
- { X86::IST_Fp16m80 , X86::IST_F16m },
- { X86::IST_Fp32m32 , X86::IST_F32m },
- { X86::IST_Fp32m64 , X86::IST_F32m },
- { X86::IST_Fp32m80 , X86::IST_F32m },
- { X86::IST_Fp64m32 , X86::IST_FP64m },
- { X86::IST_Fp64m64 , X86::IST_FP64m },
- { X86::IST_Fp64m80 , X86::IST_FP64m },
- { X86::LD_Fp032 , X86::LD_F0 },
- { X86::LD_Fp064 , X86::LD_F0 },
- { X86::LD_Fp080 , X86::LD_F0 },
- { X86::LD_Fp132 , X86::LD_F1 },
- { X86::LD_Fp164 , X86::LD_F1 },
- { X86::LD_Fp180 , X86::LD_F1 },
- { X86::LD_Fp32m , X86::LD_F32m },
- { X86::LD_Fp32m64 , X86::LD_F32m },
- { X86::LD_Fp32m80 , X86::LD_F32m },
- { X86::LD_Fp64m , X86::LD_F64m },
- { X86::LD_Fp64m80 , X86::LD_F64m },
- { X86::LD_Fp80m , X86::LD_F80m },
- { X86::MUL_Fp32m , X86::MUL_F32m },
- { X86::MUL_Fp64m , X86::MUL_F64m },
- { X86::MUL_Fp64m32 , X86::MUL_F32m },
- { X86::MUL_Fp80m32 , X86::MUL_F32m },
- { X86::MUL_Fp80m64 , X86::MUL_F64m },
- { X86::MUL_FpI16m32 , X86::MUL_FI16m },
- { X86::MUL_FpI16m64 , X86::MUL_FI16m },
- { X86::MUL_FpI16m80 , X86::MUL_FI16m },
- { X86::MUL_FpI32m32 , X86::MUL_FI32m },
- { X86::MUL_FpI32m64 , X86::MUL_FI32m },
- { X86::MUL_FpI32m80 , X86::MUL_FI32m },
- { X86::SQRT_Fp32 , X86::SQRT_F },
- { X86::SQRT_Fp64 , X86::SQRT_F },
- { X86::SQRT_Fp80 , X86::SQRT_F },
- { X86::ST_Fp32m , X86::ST_F32m },
- { X86::ST_Fp64m , X86::ST_F64m },
- { X86::ST_Fp64m32 , X86::ST_F32m },
- { X86::ST_Fp80m32 , X86::ST_F32m },
- { X86::ST_Fp80m64 , X86::ST_F64m },
- { X86::ST_FpP80m , X86::ST_FP80m },
- { X86::SUBR_Fp32m , X86::SUBR_F32m },
- { X86::SUBR_Fp64m , X86::SUBR_F64m },
- { X86::SUBR_Fp64m32 , X86::SUBR_F32m },
- { X86::SUBR_Fp80m32 , X86::SUBR_F32m },
- { X86::SUBR_Fp80m64 , X86::SUBR_F64m },
- { X86::SUBR_FpI16m32, X86::SUBR_FI16m},
- { X86::SUBR_FpI16m64, X86::SUBR_FI16m},
- { X86::SUBR_FpI16m80, X86::SUBR_FI16m},
- { X86::SUBR_FpI32m32, X86::SUBR_FI32m},
- { X86::SUBR_FpI32m64, X86::SUBR_FI32m},
- { X86::SUBR_FpI32m80, X86::SUBR_FI32m},
- { X86::SUB_Fp32m , X86::SUB_F32m },
- { X86::SUB_Fp64m , X86::SUB_F64m },
- { X86::SUB_Fp64m32 , X86::SUB_F32m },
- { X86::SUB_Fp80m32 , X86::SUB_F32m },
- { X86::SUB_Fp80m64 , X86::SUB_F64m },
- { X86::SUB_FpI16m32 , X86::SUB_FI16m },
- { X86::SUB_FpI16m64 , X86::SUB_FI16m },
- { X86::SUB_FpI16m80 , X86::SUB_FI16m },
- { X86::SUB_FpI32m32 , X86::SUB_FI32m },
- { X86::SUB_FpI32m64 , X86::SUB_FI32m },
- { X86::SUB_FpI32m80 , X86::SUB_FI32m },
- { X86::TST_Fp32 , X86::TST_F },
- { X86::TST_Fp64 , X86::TST_F },
- { X86::TST_Fp80 , X86::TST_F },
- { X86::UCOM_FpIr32 , X86::UCOM_FIr },
- { X86::UCOM_FpIr64 , X86::UCOM_FIr },
- { X86::UCOM_FpIr80 , X86::UCOM_FIr },
- { X86::UCOM_Fpr32 , X86::UCOM_Fr },
- { X86::UCOM_Fpr64 , X86::UCOM_Fr },
- { X86::UCOM_Fpr80 , X86::UCOM_Fr },
- { X86::XAM_Fp32 , X86::XAM_F },
- { X86::XAM_Fp64 , X86::XAM_F },
- { X86::XAM_Fp80 , X86::XAM_F },
+ {X86::ABS_Fp32, X86::ABS_F},
+ {X86::ABS_Fp64, X86::ABS_F},
+ {X86::ABS_Fp80, X86::ABS_F},
+ {X86::ADD_Fp32m, X86::ADD_F32m},
+ {X86::ADD_Fp64m, X86::ADD_F64m},
+ {X86::ADD_Fp64m32, X86::ADD_F32m},
+ {X86::ADD_Fp80m32, X86::ADD_F32m},
+ {X86::ADD_Fp80m64, X86::ADD_F64m},
+ {X86::ADD_FpI16m32, X86::ADD_FI16m},
+ {X86::ADD_FpI16m64, X86::ADD_FI16m},
+ {X86::ADD_FpI16m80, X86::ADD_FI16m},
+ {X86::ADD_FpI32m32, X86::ADD_FI32m},
+ {X86::ADD_FpI32m64, X86::ADD_FI32m},
+ {X86::ADD_FpI32m80, X86::ADD_FI32m},
+ {X86::CHS_Fp32, X86::CHS_F},
+ {X86::CHS_Fp64, X86::CHS_F},
+ {X86::CHS_Fp80, X86::CHS_F},
+ {X86::CMOVBE_Fp32, X86::CMOVBE_F},
+ {X86::CMOVBE_Fp64, X86::CMOVBE_F},
+ {X86::CMOVBE_Fp80, X86::CMOVBE_F},
+ {X86::CMOVB_Fp32, X86::CMOVB_F},
+ {X86::CMOVB_Fp64, X86::CMOVB_F},
+ {X86::CMOVB_Fp80, X86::CMOVB_F},
+ {X86::CMOVE_Fp32, X86::CMOVE_F},
+ {X86::CMOVE_Fp64, X86::CMOVE_F},
+ {X86::CMOVE_Fp80, X86::CMOVE_F},
+ {X86::CMOVNBE_Fp32, X86::CMOVNBE_F},
+ {X86::CMOVNBE_Fp64, X86::CMOVNBE_F},
+ {X86::CMOVNBE_Fp80, X86::CMOVNBE_F},
+ {X86::CMOVNB_Fp32, X86::CMOVNB_F},
+ {X86::CMOVNB_Fp64, X86::CMOVNB_F},
+ {X86::CMOVNB_Fp80, X86::CMOVNB_F},
+ {X86::CMOVNE_Fp32, X86::CMOVNE_F},
+ {X86::CMOVNE_Fp64, X86::CMOVNE_F},
+ {X86::CMOVNE_Fp80, X86::CMOVNE_F},
+ {X86::CMOVNP_Fp32, X86::CMOVNP_F},
+ {X86::CMOVNP_Fp64, X86::CMOVNP_F},
+ {X86::CMOVNP_Fp80, X86::CMOVNP_F},
+ {X86::CMOVP_Fp32, X86::CMOVP_F},
+ {X86::CMOVP_Fp64, X86::CMOVP_F},
+ {X86::CMOVP_Fp80, X86::CMOVP_F},
+ {X86::COM_FpIr32, X86::COM_FIr},
+ {X86::COM_FpIr64, X86::COM_FIr},
+ {X86::COM_FpIr80, X86::COM_FIr},
+ {X86::COM_Fpr32, X86::COM_FST0r},
+ {X86::COM_Fpr64, X86::COM_FST0r},
+ {X86::COM_Fpr80, X86::COM_FST0r},
+ {X86::DIVR_Fp32m, X86::DIVR_F32m},
+ {X86::DIVR_Fp64m, X86::DIVR_F64m},
+ {X86::DIVR_Fp64m32, X86::DIVR_F32m},
+ {X86::DIVR_Fp80m32, X86::DIVR_F32m},
+ {X86::DIVR_Fp80m64, X86::DIVR_F64m},
+ {X86::DIVR_FpI16m32, X86::DIVR_FI16m},
+ {X86::DIVR_FpI16m64, X86::DIVR_FI16m},
+ {X86::DIVR_FpI16m80, X86::DIVR_FI16m},
+ {X86::DIVR_FpI32m32, X86::DIVR_FI32m},
+ {X86::DIVR_FpI32m64, X86::DIVR_FI32m},
+ {X86::DIVR_FpI32m80, X86::DIVR_FI32m},
+ {X86::DIV_Fp32m, X86::DIV_F32m},
+ {X86::DIV_Fp64m, X86::DIV_F64m},
+ {X86::DIV_Fp64m32, X86::DIV_F32m},
+ {X86::DIV_Fp80m32, X86::DIV_F32m},
+ {X86::DIV_Fp80m64, X86::DIV_F64m},
+ {X86::DIV_FpI16m32, X86::DIV_FI16m},
+ {X86::DIV_FpI16m64, X86::DIV_FI16m},
+ {X86::DIV_FpI16m80, X86::DIV_FI16m},
+ {X86::DIV_FpI32m32, X86::DIV_FI32m},
+ {X86::DIV_FpI32m64, X86::DIV_FI32m},
+ {X86::DIV_FpI32m80, X86::DIV_FI32m},
+ {X86::ILD_Fp16m32, X86::ILD_F16m},
+ {X86::ILD_Fp16m64, X86::ILD_F16m},
+ {X86::ILD_Fp16m80, X86::ILD_F16m},
+ {X86::ILD_Fp32m32, X86::ILD_F32m},
+ {X86::ILD_Fp32m64, X86::ILD_F32m},
+ {X86::ILD_Fp32m80, X86::ILD_F32m},
+ {X86::ILD_Fp64m32, X86::ILD_F64m},
+ {X86::ILD_Fp64m64, X86::ILD_F64m},
+ {X86::ILD_Fp64m80, X86::ILD_F64m},
+ {X86::ISTT_Fp16m32, X86::ISTT_FP16m},
+ {X86::ISTT_Fp16m64, X86::ISTT_FP16m},
+ {X86::ISTT_Fp16m80, X86::ISTT_FP16m},
+ {X86::ISTT_Fp32m32, X86::ISTT_FP32m},
+ {X86::ISTT_Fp32m64, X86::ISTT_FP32m},
+ {X86::ISTT_Fp32m80, X86::ISTT_FP32m},
+ {X86::ISTT_Fp64m32, X86::ISTT_FP64m},
+ {X86::ISTT_Fp64m64, X86::ISTT_FP64m},
+ {X86::ISTT_Fp64m80, X86::ISTT_FP64m},
+ {X86::IST_Fp16m32, X86::IST_F16m},
+ {X86::IST_Fp16m64, X86::IST_F16m},
+ {X86::IST_Fp16m80, X86::IST_F16m},
+ {X86::IST_Fp32m32, X86::IST_F32m},
+ {X86::IST_Fp32m64, X86::IST_F32m},
+ {X86::IST_Fp32m80, X86::IST_F32m},
+ {X86::IST_Fp64m32, X86::IST_FP64m},
+ {X86::IST_Fp64m64, X86::IST_FP64m},
+ {X86::IST_Fp64m80, X86::IST_FP64m},
+ {X86::LD_Fp032, X86::LD_F0},
+ {X86::LD_Fp064, X86::LD_F0},
+ {X86::LD_Fp080, X86::LD_F0},
+ {X86::LD_Fp132, X86::LD_F1},
+ {X86::LD_Fp164, X86::LD_F1},
+ {X86::LD_Fp180, X86::LD_F1},
+ {X86::LD_Fp32m, X86::LD_F32m},
+ {X86::LD_Fp32m64, X86::LD_F32m},
+ {X86::LD_Fp32m80, X86::LD_F32m},
+ {X86::LD_Fp64m, X86::LD_F64m},
+ {X86::LD_Fp64m80, X86::LD_F64m},
+ {X86::LD_Fp80m, X86::LD_F80m},
+ {X86::MUL_Fp32m, X86::MUL_F32m},
+ {X86::MUL_Fp64m, X86::MUL_F64m},
+ {X86::MUL_Fp64m32, X86::MUL_F32m},
+ {X86::MUL_Fp80m32, X86::MUL_F32m},
+ {X86::MUL_Fp80m64, X86::MUL_F64m},
+ {X86::MUL_FpI16m32, X86::MUL_FI16m},
+ {X86::MUL_FpI16m64, X86::MUL_FI16m},
+ {X86::MUL_FpI16m80, X86::MUL_FI16m},
+ {X86::MUL_FpI32m32, X86::MUL_FI32m},
+ {X86::MUL_FpI32m64, X86::MUL_FI32m},
+ {X86::MUL_FpI32m80, X86::MUL_FI32m},
+ {X86::SQRT_Fp32, X86::SQRT_F},
+ {X86::SQRT_Fp64, X86::SQRT_F},
+ {X86::SQRT_Fp80, X86::SQRT_F},
+ {X86::ST_Fp32m, X86::ST_F32m},
+ {X86::ST_Fp64m, X86::ST_F64m},
+ {X86::ST_Fp64m32, X86::ST_F32m},
+ {X86::ST_Fp80m32, X86::ST_F32m},
+ {X86::ST_Fp80m64, X86::ST_F64m},
+ {X86::ST_FpP80m, X86::ST_FP80m},
+ {X86::SUBR_Fp32m, X86::SUBR_F32m},
+ {X86::SUBR_Fp64m, X86::SUBR_F64m},
+ {X86::SUBR_Fp64m32, X86::SUBR_F32m},
+ {X86::SUBR_Fp80m32, X86::SUBR_F32m},
+ {X86::SUBR_Fp80m64, X86::SUBR_F64m},
+ {X86::SUBR_FpI16m32, X86::SUBR_FI16m},
+ {X86::SUBR_FpI16m64, X86::SUBR_FI16m},
+ {X86::SUBR_FpI16m80, X86::SUBR_FI16m},
+ {X86::SUBR_FpI32m32, X86::SUBR_FI32m},
+ {X86::SUBR_FpI32m64, X86::SUBR_FI32m},
+ {X86::SUBR_FpI32m80, X86::SUBR_FI32m},
+ {X86::SUB_Fp32m, X86::SUB_F32m},
+ {X86::SUB_Fp64m, X86::SUB_F64m},
+ {X86::SUB_Fp64m32, X86::SUB_F32m},
+ {X86::SUB_Fp80m32, X86::SUB_F32m},
+ {X86::SUB_Fp80m64, X86::SUB_F64m},
+ {X86::SUB_FpI16m32, X86::SUB_FI16m},
+ {X86::SUB_FpI16m64, X86::SUB_FI16m},
+ {X86::SUB_FpI16m80, X86::SUB_FI16m},
+ {X86::SUB_FpI32m32, X86::SUB_FI32m},
+ {X86::SUB_FpI32m64, X86::SUB_FI32m},
+ {X86::SUB_FpI32m80, X86::SUB_FI32m},
+ {X86::TST_Fp32, X86::TST_F},
+ {X86::TST_Fp64, X86::TST_F},
+ {X86::TST_Fp80, X86::TST_F},
+ {X86::UCOM_FpIr32, X86::UCOM_FIr},
+ {X86::UCOM_FpIr64, X86::UCOM_FIr},
+ {X86::UCOM_FpIr80, X86::UCOM_FIr},
+ {X86::UCOM_Fpr32, X86::UCOM_Fr},
+ {X86::UCOM_Fpr64, X86::UCOM_Fr},
+ {X86::UCOM_Fpr80, X86::UCOM_Fr},
+ {X86::XAM_Fp32, X86::XAM_F},
+ {X86::XAM_Fp64, X86::XAM_F},
+ {X86::XAM_Fp80, X86::XAM_F},
};
static unsigned getConcreteOpcode(unsigned Opcode) {
@@ -817,31 +841,25 @@ static unsigned getConcreteOpcode(unsigned Opcode) {
// element is an instruction, the second is the version which pops.
//
static const TableEntry PopTable[] = {
- { X86::ADD_FrST0 , X86::ADD_FPrST0 },
+ {X86::ADD_FrST0, X86::ADD_FPrST0},
- { X86::COMP_FST0r, X86::FCOMPP },
- { X86::COM_FIr , X86::COM_FIPr },
- { X86::COM_FST0r , X86::COMP_FST0r },
+ {X86::COMP_FST0r, X86::FCOMPP}, {X86::COM_FIr, X86::COM_FIPr},
+ {X86::COM_FST0r, X86::COMP_FST0r},
- { X86::DIVR_FrST0, X86::DIVR_FPrST0 },
- { X86::DIV_FrST0 , X86::DIV_FPrST0 },
+ {X86::DIVR_FrST0, X86::DIVR_FPrST0}, {X86::DIV_FrST0, X86::DIV_FPrST0},
- { X86::IST_F16m , X86::IST_FP16m },
- { X86::IST_F32m , X86::IST_FP32m },
+ {X86::IST_F16m, X86::IST_FP16m}, {X86::IST_F32m, X86::IST_FP32m},
- { X86::MUL_FrST0 , X86::MUL_FPrST0 },
+ {X86::MUL_FrST0, X86::MUL_FPrST0},
- { X86::ST_F32m , X86::ST_FP32m },
- { X86::ST_F64m , X86::ST_FP64m },
- { X86::ST_Frr , X86::ST_FPrr },
+ {X86::ST_F32m, X86::ST_FP32m}, {X86::ST_F64m, X86::ST_FP64m},
+ {X86::ST_Frr, X86::ST_FPrr},
- { X86::SUBR_FrST0, X86::SUBR_FPrST0 },
- { X86::SUB_FrST0 , X86::SUB_FPrST0 },
+ {X86::SUBR_FrST0, X86::SUBR_FPrST0}, {X86::SUB_FrST0, X86::SUB_FPrST0},
- { X86::UCOM_FIr , X86::UCOM_FIPr },
+ {X86::UCOM_FIr, X86::UCOM_FIPr},
- { X86::UCOM_FPr , X86::UCOM_FPPr },
- { X86::UCOM_Fr , X86::UCOM_FPr },
+ {X86::UCOM_FPr, X86::UCOM_FPPr}, {X86::UCOM_Fr, X86::UCOM_FPr},
};
static bool doesInstructionSetFPSW(MachineInstr &MI) {
@@ -883,7 +901,7 @@ void FPS::popStackAfter(MachineBasicBlock::iterator &I) {
if (Opcode == X86::FCOMPP || Opcode == X86::UCOM_FPPr)
I->removeOperand(0);
MI.dropDebugNumber();
- } else { // Insert an explicit pop
+ } else { // Insert an explicit pop
// If this instruction sets FPSW, which is read in following instruction,
// insert pop after that reader.
if (doesInstructionSetFPSW(MI)) {
@@ -901,7 +919,7 @@ void FPS::popStackAfter(MachineBasicBlock::iterator &I) {
/// of the stack, we just pop the current instruction, otherwise we store the
/// current top-of-stack into the specified slot, then pop the top of stack.
void FPS::freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned FPRegNo) {
- if (getStackEntry(0) == FPRegNo) { // already at the top of stack? easy.
+ if (getStackEntry(0) == FPRegNo) { // already at the top of stack? easy.
popStackAfter(I);
return;
}
@@ -916,12 +934,12 @@ void FPS::freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned FPRegNo) {
/// folding.
MachineBasicBlock::iterator
FPS::freeStackSlotBefore(MachineBasicBlock::iterator I, unsigned FPRegNo) {
- unsigned STReg = getSTReg(FPRegNo);
- unsigned OldSlot = getSlot(FPRegNo);
- unsigned TopReg = Stack[StackTop-1];
- Stack[OldSlot] = TopReg;
- RegMap[TopReg] = OldSlot;
- RegMap[FPRegNo] = ~0;
+ unsigned STReg = getSTReg(FPRegNo);
+ unsigned OldSlot = getSlot(FPRegNo);
+ unsigned TopReg = Stack[StackTop - 1];
+ Stack[OldSlot] = TopReg;
+ RegMap[TopReg] = OldSlot;
+ RegMap[FPRegNo] = ~0;
Stack[--StackTop] = ~0;
return BuildMI(*MBB, I, DebugLoc(), TII->get(X86::ST_FPrr))
.addReg(STReg)
@@ -978,7 +996,7 @@ void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) {
}
// Load zeros for all the imp-defs.
- while(Defs) {
+ while (Defs) {
unsigned DReg = llvm::countr_zero(Defs);
LLVM_DEBUG(dbgs() << "Defining %fp" << DReg << " as 0\n");
BuildMI(*MBB, I, DebugLoc(), TII->get(X86::LD_F0));
@@ -994,8 +1012,7 @@ void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) {
/// shuffleStackTop - emit fxch instructions before I to shuffle the top
/// FixCount entries into the order given by FixStack.
/// FIXME: Is there a better algorithm than insertion sort?
-void FPS::shuffleStackTop(const unsigned char *FixStack,
- unsigned FixCount,
+void FPS::shuffleStackTop(const unsigned char *FixStack, unsigned FixCount,
MachineBasicBlock::iterator I) {
// Move items into place, starting from the desired stack bottom.
while (FixCount--) {
@@ -1013,7 +1030,6 @@ void FPS::shuffleStackTop(const unsigned char *FixStack,
LLVM_DEBUG(dumpStack());
}
-
//===----------------------------------------------------------------------===//
// Instruction transformation implementation
//===----------------------------------------------------------------------===//
@@ -1122,7 +1138,8 @@ void FPS::handleReturn(MachineBasicBlock::iterator &I) {
// We may have been carrying spurious live-ins, so make sure only the
// returned registers are left live.
adjustLiveRegs(LiveMask, MI);
- if (!LiveMask) return; // Quick check to see if any are possible.
+ if (!LiveMask)
+ return; // Quick check to see if any are possible.
// There are only four possibilities here:
// 1) we are returning a single FP value. In this case, it has to be in
@@ -1144,7 +1161,7 @@ void FPS::handleReturn(MachineBasicBlock::iterator &I) {
// 2) If returning the same value for both, we only have one thing in the FP
// stack. Consider: RET FP1, FP1
if (StackTop == 1) {
- assert(FirstFPRegOp == SecondFPRegOp && FirstFPRegOp == getStackEntry(0)&&
+ assert(FirstFPRegOp == SecondFPRegOp && FirstFPRegOp == getStackEntry(0) &&
"Stack misconfiguration for RET!");
// Duplicate the TOS so that we return it twice. Just pick some other FPx
@@ -1222,7 +1239,7 @@ void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) {
MI.getOpcode() == X86::ST_FpP80m)) {
duplicateToTop(Reg, ScratchFPReg, I);
} else {
- moveToTop(Reg, I); // Move to the top of the stack...
+ moveToTop(Reg, I); // Move to the top of the stack...
}
// Convert from the pseudo instruction to the concrete instruction.
@@ -1244,7 +1261,6 @@ void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) {
MI.dropDebugNumber();
}
-
/// handleOneArgFPRW: Handle instructions that read from the top of stack and
/// replace the value with a newly computed value. These instructions may have
/// non-fp operands after their FP operands.
@@ -1285,76 +1301,62 @@ void FPS::handleOneArgFPRW(MachineBasicBlock::iterator &I) {
MI.dropDebugNumber();
}
-
//===----------------------------------------------------------------------===//
// Define tables of various ways to map pseudo instructions
//
// ForwardST0Table - Map: A = B op C into: ST(0) = ST(0) op ST(i)
static const TableEntry ForwardST0Table[] = {
- { X86::ADD_Fp32 , X86::ADD_FST0r },
- { X86::ADD_Fp64 , X86::ADD_FST0r },
- { X86::ADD_Fp80 , X86::ADD_FST0r },
- { X86::DIV_Fp32 , X86::DIV_FST0r },
- { X86::DIV_Fp64 , X86::DIV_FST0r },
- { X86::DIV_Fp80 , X86::DIV_FST0r },
- { X86::MUL_Fp32 , X86::MUL_FST0r },
- { X86::MUL_Fp64 , X86::MUL_FST0r },
- { X86::MUL_Fp80 , X86::MUL_FST0r },
- { X86::SUB_Fp32 , X86::SUB_FST0r },
- { X86::SUB_Fp64 , X86::SUB_FST0r },
- { X86::SUB_Fp80 , X86::SUB_FST0r },
+ {X86::ADD_Fp32, X86::ADD_FST0r}, {X86::ADD_Fp64, X86::ADD_FST0r},
+ {X86::ADD_Fp80, X86::ADD_FST0r}, {X86::DIV_Fp32, X86::DIV_FST0r},
+ {X86::DIV_Fp64, X86::DIV_FST0r}, {X86::DIV_Fp80, X86::DIV_FST0r},
+ {X86::MUL_Fp32, X86::MUL_FST0r}, {X86::MUL_Fp64, X86::MUL_FST0r},
+ {X86::MUL_Fp80, X86::MUL_FST0r}, {X86::SUB_Fp32, X86::SUB_FST0r},
+ {X86::SUB_Fp64, X86::SUB_FST0r}, {X86::SUB_Fp80, X86::SUB_FST0r},
};
// ReverseST0Table - Map: A = B op C into: ST(0) = ST(i) op ST(0)
static const TableEntry ReverseST0Table[] = {
- { X86::ADD_Fp32 , X86::ADD_FST0r }, // commutative
- { X86::ADD_Fp64 , X86::ADD_FST0r }, // commutative
- { X86::ADD_Fp80 , X86::ADD_FST0r }, // commutative
- { X86::DIV_Fp32 , X86::DIVR_FST0r },
- { X86::DIV_Fp64 , X86::DIVR_FST0r },
- { X86::DIV_Fp80 , X86::DIVR_FST0r },
- { X86::MUL_Fp32 , X86::MUL_FST0r }, // commutative
- { X86::MUL_Fp64 , X86::MUL_FST0r }, // commutative
- { X86::MUL_Fp80 , X86::MUL_FST0r }, // commutative
- { X86::SUB_Fp32 , X86::SUBR_FST0r },
- { X86::SUB_Fp64 , X86::SUBR_FST0r },
- { X86::SUB_Fp80 , X86::SUBR_FST0r },
+ {X86::ADD_Fp32, X86::ADD_FST0r}, // commutative
+ {X86::ADD_Fp64, X86::ADD_FST0r}, // commutative
+ {X86::ADD_Fp80, X86::ADD_FST0r}, // commutative
+ {X86::DIV_Fp32, X86::DIVR_FST0r},
+ {X86::DIV_Fp64, X86::DIVR_FST0r},
+ {X86::DIV_Fp80, X86::DIVR_FST0r},
+ {X86::MUL_Fp32, X86::MUL_FST0r}, // commutative
+ {X86::MUL_Fp64, X86::MUL_FST0r}, // commutative
+ {X86::MUL_Fp80, X86::MUL_FST0r}, // commutative
+ {X86::SUB_Fp32, X86::SUBR_FST0r},
+ {X86::SUB_Fp64, X86::SUBR_FST0r},
+ {X86::SUB_Fp80, X86::SUBR_FST0r},
};
// ForwardSTiTable - Map: A = B op C into: ST(i) = ST(0) op ST(i)
static const TableEntry ForwardSTiTable[] = {
- { X86::ADD_Fp32 , X86::ADD_FrST0 }, // commutative
- { X86::ADD_Fp64 , X86::ADD_FrST0 }, // commutative
- { X86::ADD_Fp80 , X86::ADD_FrST0 }, // commutative
- { X86::DIV_Fp32 , X86::DIVR_FrST0 },
- { X86::DIV_Fp64 , X86::DIVR_FrST0 },
- { X86::DIV_Fp80 , X86::DIVR_FrST0 },
- { X86::MUL_Fp32 , X86::MUL_FrST0 }, // commutative
- { X86::MUL_Fp64 , X86::MUL_FrST0 }, // commutative
- { X86::MUL_Fp80 , X86::MUL_FrST0 }, // commutative
- { X86::SUB_Fp32 , X86::SUBR_FrST0 },
- { X86::SUB_Fp64 , X86::SUBR_FrST0 },
- { X86::SUB_Fp80 , X86::SUBR_FrST0 },
+ {X86::ADD_Fp32, X86::ADD_FrST0}, // commutative
+ {X86::ADD_Fp64, X86::ADD_FrST0}, // commutative
+ {X86::ADD_Fp80, X86::ADD_FrST0}, // commutative
+ {X86::DIV_Fp32, X86::DIVR_FrST0},
+ {X86::DIV_Fp64, X86::DIVR_FrST0},
+ {X86::DIV_Fp80, X86::DIVR_FrST0},
+ {X86::MUL_Fp32, X86::MUL_FrST0}, // commutative
+ {X86::MUL_Fp64, X86::MUL_FrST0}, // commutative
+ {X86::MUL_Fp80, X86::MUL_FrST0}, // commutative
+ {X86::SUB_Fp32, X86::SUBR_FrST0},
+ {X86::SUB_Fp64, X86::SUBR_FrST0},
+ {X86::SUB_Fp80, X86::SUBR_FrST0},
};
// ReverseSTiTable - Map: A = B op C into: ST(i) = ST(i) op ST(0)
static const TableEntry ReverseSTiTable[] = {
- { X86::ADD_Fp32 , X86::ADD_FrST0 },
- { X86::ADD_Fp64 , X86::ADD_FrST0 },
- { X86::ADD_Fp80 , X86::ADD_FrST0 },
- { X86::DIV_Fp32 , X86::DIV_FrST0 },
- { X86::DIV_Fp64 , X86::DIV_FrST0 },
- { X86::DIV_Fp80 , X86::DIV_FrST0 },
- { X86::MUL_Fp32 , X86::MUL_FrST0 },
- { X86::MUL_Fp64 , X86::MUL_FrST0 },
- { X86::MUL_Fp80 , X86::MUL_FrST0 },
- { X86::SUB_Fp32 , X86::SUB_FrST0 },
- { X86::SUB_Fp64 , X86::SUB_FrST0 },
- { X86::SUB_Fp80 , X86::SUB_FrST0 },
+ {X86::ADD_Fp32, X86::ADD_FrST0}, {X86::ADD_Fp64, X86::ADD_FrST0},
+ {X86::ADD_Fp80, X86::ADD_FrST0}, {X86::DIV_Fp32, X86::DIV_FrST0},
+ {X86::DIV_Fp64, X86::DIV_FrST0}, {X86::DIV_Fp80, X86::DIV_FrST0},
+ {X86::MUL_Fp32, X86::MUL_FrST0}, {X86::MUL_Fp64, X86::MUL_FrST0},
+ {X86::MUL_Fp80, X86::MUL_FrST0}, {X86::SUB_Fp32, X86::SUB_FrST0},
+ {X86::SUB_Fp64, X86::SUB_FrST0}, {X86::SUB_Fp80, X86::SUB_FrST0},
};
-
/// handleTwoArgFP - Handle instructions like FADD and friends which are virtual
/// instructions which need to be simplified and possibly transformed.
///
@@ -1364,8 +1366,10 @@ static const TableEntry ReverseSTiTable[] = {
/// ST(i) = fsubr ST(0), ST(i)
///
void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) {
- ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table);
- ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable);
+ ASSERT_SORTED(ForwardST0Table);
+ ASSERT_SORTED(ReverseST0Table);
+ ASSERT_SORTED(ForwardSTiTable);
+ ASSERT_SORTED(ReverseSTiTable);
MachineInstr &MI = *I;
unsigned NumOperands = MI.getDesc().getNumOperands();
@@ -1381,12 +1385,12 @@ void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) {
// One of our operands must be on the top of the stack. If neither is yet, we
// need to move one.
- if (Op0 != TOS && Op1 != TOS) { // No operand at TOS?
+ if (Op0 != TOS && Op1 != TOS) { // No operand at TOS?
// We can choose to move either operand to the top of the stack. If one of
// the operands is killed by this instruction, we want that one so that we
// can update right on top of the old version.
if (KillsOp0) {
- moveToTop(Op0, I); // Move dead operand to TOS.
+ moveToTop(Op0, I); // Move dead operand to TOS.
TOS = Op0;
} else if (KillsOp1) {
moveToTop(Op1, I);
@@ -1449,15 +1453,15 @@ void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) {
// overwriting the other one.
if (KillsOp0 && KillsOp1 && Op0 != Op1) {
assert(!updateST0 && "Should have updated other operand!");
- popStackAfter(I); // Pop the top of stack
+ popStackAfter(I); // Pop the top of stack
}
// Update stack information so that we know the destination register is now on
// the stack.
unsigned UpdatedSlot = getSlot(updateST0 ? TOS : NotTOS);
assert(UpdatedSlot < StackTop && Dest < 7);
- Stack[UpdatedSlot] = Dest;
- RegMap[Dest] = UpdatedSlot;
+ Stack[UpdatedSlot] = Dest;
+ RegMap[Dest] = UpdatedSlot;
MBB->getParent()->deleteMachineInstr(&MI); // Remove the old instruction
}
@@ -1485,8 +1489,10 @@ void FPS::handleCompareFP(MachineBasicBlock::iterator &I) {
MI.dropDebugNumber();
// If any of the operands are killed by this instruction, free them.
- if (KillsOp0) freeStackSlotAfter(I, Op0);
- if (KillsOp1 && Op0 != Op1) freeStackSlotAfter(I, Op1);
+ if (KillsOp0)
+ freeStackSlotAfter(I, Op0);
+ if (KillsOp1 && Op0 != Op1)
+ freeStackSlotAfter(I, Op1);
}
/// handleCondMovFP - Handle two address conditional move instructions. These
@@ -1518,7 +1524,6 @@ void FPS::handleCondMovFP(MachineBasicBlock::iterator &I) {
}
}
-
/// handleSpecialFP - Handle special instructions which behave unlike other
/// floating point instructions. This is primarily intended for use by pseudo
/// instructions.
@@ -1537,7 +1542,8 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) {
}
switch (MI.getOpcode()) {
- default: llvm_unreachable("Unknown SpecialFP instruction!");
+ default:
+ llvm_unreachable("Unknown SpecialFP instruction!");
case TargetOpcode::COPY: {
// We handle three kinds of copies: FP <- FP, FP <- ST, and ST <- FP.
const MachineOperand &MO1 = MI.getOperand(1);
@@ -1770,7 +1776,7 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) {
}
}
- Inst = MBB->erase(Inst); // Remove the pseudo instruction
+ Inst = MBB->erase(Inst); // Remove the pseudo instruction
// We want to leave I pointing to the previous instruction, but what if we
// just erased the first instruction?
@@ -1819,3 +1825,29 @@ void FPS::setKillFlags(MachineBasicBlock &MBB) const {
LPR.stepBackward(MI);
}
}
+
+bool X86FPStackifierLegacy::runOnMachineFunction(MachineFunction &MF) {
+ FPS Impl;
+ if (!Impl.shouldRun(MF))
+ return false;
+
+ EdgeBundles *Bundles =
+ &getAnalysis<EdgeBundlesWrapperLegacy>().getEdgeBundles();
+  return Impl.run(MF, Bundles);
+}
+
+PreservedAnalyses
+X86FPStackifierPass::run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM) {
+ FPS Impl;
+ if (!Impl.shouldRun(MF))
+ return PreservedAnalyses::all();
+
+ EdgeBundles *Bundles = &MFAM.getResult<EdgeBundlesAnalysis>(MF);
+ bool Changed = Impl.run(MF, Bundles);
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA = PreservedAnalyses::none();
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
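
The hunks above split the old all-in-one pass into a shared FPS implementation plus two thin drivers: the legacy X86FPStackifierLegacy wrapper and the new-pass-manager X86FPStackifierPass. The standalone sketch below only illustrates that shape; every name in it (StackifierImpl, runLegacy, runNewPM, FunctionState) is hypothetical, it uses no LLVM headers, and it is not the real pass — just the cheap-gate-then-rewrite structure both entry points share.

// Standalone illustration of the "shared implementation, two entry points"
// pattern from the hunk above. All names are hypothetical.
#include <iostream>

struct FunctionState {
  bool UsesFP = false; // stand-in for "any FP virtual registers are used"
};

// Shared implementation: a cheap gate first, the expensive rewrite second.
class StackifierImpl {
public:
  bool shouldRun(const FunctionState &F) const { return F.UsesFP; }
  bool run(FunctionState &F) {
    (void)F; // the real pass mutates the machine function here
    // The real pass rewrites FP register references into FP stack
    // references; this sketch only reports that a change was made.
    std::cout << "rewriting FP registers\n";
    return true;
  }
};

// Legacy-style driver: returns whether the function was modified.
bool runLegacy(FunctionState &F) {
  StackifierImpl Impl;
  if (!Impl.shouldRun(F))
    return false;
  return Impl.run(F);
}

// New-pass-manager-style driver: reports which analyses remain valid.
enum class Preserved { All, CFGOnly };
Preserved runNewPM(FunctionState &F) {
  StackifierImpl Impl;
  if (!Impl.shouldRun(F) || !Impl.run(F))
    return Preserved::All;   // nothing changed, keep all analyses
  return Preserved::CFGOnly; // code changed, but the CFG is untouched
}

int main() {
  FunctionState F;
  F.UsesFP = true;
  runLegacy(F); // prints once
  runNewPM(F);  // prints once more
  return 0;
}
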
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index a66a321..8bca634 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -3093,8 +3093,8 @@ bool X86FrameLowering::spillCalleeSavedRegisters(
MBB.addLiveIn(Reg);
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
- TII.storeRegToStackSlot(MBB, MI, Reg, true, I.getFrameIdx(), RC, TRI,
- Register(), MachineInstr::FrameSetup);
+ TII.storeRegToStackSlot(MBB, MI, Reg, true, I.getFrameIdx(), RC, Register(),
+ MachineInstr::FrameSetup);
}
return true;
@@ -3166,8 +3166,7 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(
VT = STI.hasBWI() ? MVT::v64i1 : MVT::v16i1;
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
- TII.loadRegFromStackSlot(MBB, MI, Reg, I.getFrameIdx(), RC, TRI,
- Register());
+ TII.loadRegFromStackSlot(MBB, MI, Reg, I.getFrameIdx(), RC, Register());
}
// Clear the stack slot for spill base pointer register.
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index d4418c8..e7903a7 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -1004,7 +1004,9 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
N->getSimpleValueType(0).isVector() && !mayPreventLoadFold()) {
APInt SplatVal;
- if (X86::isConstantSplat(N->getOperand(1), SplatVal) &&
+ if (!ISD::isBuildVectorOfConstantSDNodes(
+ peekThroughBitcasts(N->getOperand(0)).getNode()) &&
+ X86::isConstantSplat(N->getOperand(1), SplatVal) &&
SplatVal.isOne()) {
SDLoc DL(N);
@@ -4728,9 +4730,9 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
auto tryPeelOuterNotWrappingLogic = [&](SDNode *Op) {
if (Op->getOpcode() == ISD::XOR && Op->hasOneUse() &&
ISD::isBuildVectorAllOnes(Op->getOperand(1).getNode())) {
- SDValue InnerOp = Op->getOperand(0);
+ SDValue InnerOp = getFoldableLogicOp(Op->getOperand(0));
- if (!getFoldableLogicOp(InnerOp))
+ if (!InnerOp)
return SDValue();
N0 = InnerOp.getOperand(0);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 007074c..fbd875a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -130,7 +130,7 @@ static cl::opt<bool> MulConstantOptimization(
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
const X86Subtarget &STI)
- : TargetLowering(TM), Subtarget(STI) {
+ : TargetLowering(TM, STI), Subtarget(STI) {
bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
@@ -635,6 +635,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FROUNDEVEN, VT, Action);
setOperationAction(ISD::FTRUNC, VT, Action);
setOperationAction(ISD::FLDEXP, VT, Action);
+ setOperationAction(ISD::FSINCOSPI, VT, Action);
};
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
@@ -2072,8 +2073,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (Subtarget.hasVBMI2()) {
for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
- setOperationAction(ISD::FSHL, VT, Custom);
- setOperationAction(ISD::FSHR, VT, Custom);
+ setOperationAction(ISD::FSHL, VT, Legal);
+ setOperationAction(ISD::FSHR, VT, Legal);
}
setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
@@ -2088,8 +2089,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
MVT::v4i64}) {
- setOperationAction(ISD::FSHL, VT, Custom);
- setOperationAction(ISD::FSHR, VT, Custom);
+ setOperationAction(ISD::FSHL, VT, Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::FSHR, VT, Subtarget.hasVLX() ? Legal : Custom);
}
}
@@ -2097,9 +2098,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// pre-AVX512 equivalents. Without VLX we use 512-bit operations for
// narrower widths.
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
+ for (MVT VT : {MVT::f16, MVT::f32, MVT::f64, MVT::v8f16, MVT::v4f32,
+ MVT::v2f64, MVT::v16f16, MVT::v8f32, MVT::v4f64, MVT::v32f16,
+ MVT::v16f32, MVT::v8f64})
+ setOperationAction(ISD::FLDEXP, VT, Custom);
+
// These operations are handled on non-VLX by artificially widening in
// isel patterns.
-
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
@@ -2150,6 +2155,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
if (Subtarget.hasCDI()) {
+ for (auto VT : {MVT::i256, MVT::i512}) {
+ if (VT == MVT::i512 && !Subtarget.useAVX512Regs())
+ continue;
+ setOperationAction(ISD::CTLZ, VT, Custom);
+ setOperationAction(ISD::CTTZ, VT, Custom);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
+ }
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
setOperationAction(ISD::CTLZ, VT, Legal);
}
@@ -2572,8 +2585,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
// Combine sin / cos into _sincos_stret if it is available.
- setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
- setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
+ setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
if (Subtarget.isTargetWin64()) {
setOperationAction(ISD::SDIV, MVT::i128, Custom);
@@ -2655,6 +2668,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
ISD::AVGFLOORU,
ISD::BITREVERSE,
ISD::ADD,
+ ISD::SADDSAT,
+ ISD::SSUBSAT,
ISD::FADD,
ISD::FSUB,
ISD::FNEG,
@@ -2694,6 +2709,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
ISD::STRICT_FP_EXTEND,
ISD::FP_ROUND,
ISD::STRICT_FP_ROUND,
+ ISD::FSHL,
+ ISD::FSHR,
ISD::INTRINSIC_VOID,
ISD::INTRINSIC_WO_CHAIN,
ISD::INTRINSIC_W_CHAIN});
@@ -2871,6 +2888,8 @@ static bool isTargetShuffle(unsigned Opcode) {
case X86ISD::VPERMV:
case X86ISD::VPERMV3:
case X86ISD::VZEXT_MOVL:
+ case X86ISD::COMPRESS:
+ case X86ISD::EXPAND:
return true;
}
}
@@ -3087,7 +3106,7 @@ static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
}
bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
- const CallInst &I,
+ const CallBase &I,
MachineFunction &MF,
unsigned Intrinsic) const {
Info.flags = MachineMemOperand::MONone;
@@ -3454,6 +3473,12 @@ bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
return true;
+ // If we have a large vector type (even if illegal), don't bitcast to large
+ // (illegal) scalar types. Better to load fewer vectors and extract.
+ if (LoadVT.isVector() && !BitcastVT.isVector() && LoadVT.isInteger() &&
+ BitcastVT.isInteger() && (LoadVT.getSizeInBits() % 128) == 0)
+ return false;
+
return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
}
@@ -5358,12 +5383,12 @@ bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
int getRoundingModeX86(unsigned RM) {
switch (static_cast<::llvm::RoundingMode>(RM)) {
// clang-format off
- case ::llvm::RoundingMode::NearestTiesToEven: return X86::rmToNearest; break;
- case ::llvm::RoundingMode::TowardNegative: return X86::rmDownward; break;
- case ::llvm::RoundingMode::TowardPositive: return X86::rmUpward; break;
- case ::llvm::RoundingMode::TowardZero: return X86::rmTowardZero; break;
- default:
- return X86::rmInvalid; // Invalid rounding mode
+ case ::llvm::RoundingMode::NearestTiesToEven: return X86::rmToNearest;
+ case ::llvm::RoundingMode::TowardNegative: return X86::rmDownward;
+ case ::llvm::RoundingMode::TowardPositive: return X86::rmUpward;
+ case ::llvm::RoundingMode::TowardZero: return X86::rmTowardZero;
+ default: return X86::rmInvalid;
+ // clang-format on
}
}
@@ -5816,6 +5841,48 @@ static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
}
return false;
}
+ case X86ISD::COMPRESS: {
+ SDValue CmpVec = N.getOperand(0);
+ SDValue PassThru = N.getOperand(1);
+ SDValue CmpMask = N.getOperand(2);
+ APInt UndefElts;
+ SmallVector<APInt> EltBits;
+ if (!getTargetConstantBitsFromNode(CmpMask, 1, UndefElts, EltBits))
+ return false;
+ assert(UndefElts.getBitWidth() == NumElems && EltBits.size() == NumElems &&
+ "Illegal compression mask");
+ for (unsigned I = 0; I != NumElems; ++I) {
+ if (!EltBits[I].isZero())
+ Mask.push_back(I);
+ }
+ while (Mask.size() != NumElems) {
+ Mask.push_back(NumElems + Mask.size());
+ }
+ Ops.push_back(CmpVec);
+ Ops.push_back(PassThru);
+ return true;
+ }
+ case X86ISD::EXPAND: {
+ SDValue ExpVec = N.getOperand(0);
+ SDValue PassThru = N.getOperand(1);
+ SDValue ExpMask = N.getOperand(2);
+ APInt UndefElts;
+ SmallVector<APInt> EltBits;
+ if (!getTargetConstantBitsFromNode(ExpMask, 1, UndefElts, EltBits))
+ return false;
+ assert(UndefElts.getBitWidth() == NumElems && EltBits.size() == NumElems &&
+ "Illegal expansion mask");
+ unsigned ExpIndex = 0;
+ for (unsigned I = 0; I != NumElems; ++I) {
+ if (EltBits[I].isZero())
+ Mask.push_back(I + NumElems);
+ else
+ Mask.push_back(ExpIndex++);
+ }
+ Ops.push_back(ExpVec);
+ Ops.push_back(PassThru);
+ return true;
+ }
default:
llvm_unreachable("unknown target shuffle node");
}
@@ -7270,7 +7337,10 @@ static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
const SDLoc &DL, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
- bool IsAfterLegalize) {
+ bool IsAfterLegalize,
+ unsigned Depth = 0) {
+ if (Depth >= SelectionDAG::MaxRecursionDepth)
+ return SDValue(); // Limit search depth.
if ((VT.getScalarSizeInBits() % 8) != 0)
return SDValue();
@@ -7444,7 +7514,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
SDValue HalfLD =
EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
- DAG, Subtarget, IsAfterLegalize);
+ DAG, Subtarget, IsAfterLegalize, Depth + 1);
if (HalfLD)
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
HalfLD, DAG.getVectorIdxConstant(0, DL));
@@ -7521,7 +7591,8 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
VT.getSizeInBits() / ScalarSize);
if (TLI.isTypeLegal(BroadcastVT)) {
if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
- RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
+ RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize,
+ Depth + 1)) {
SDValue Broadcast = RepeatLoad;
if (RepeatSize > ScalarSize) {
while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
@@ -7542,6 +7613,20 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
}
}
+ // REVERSE - attempt to match the loads in reverse and then shuffle back.
+ // TODO: Do this for any permute or mismatching element counts.
+ if (Depth == 0 && ZeroMask.isZero() && UndefMask.isZero() &&
+ TLI.isTypeLegal(VT) && VT.isVector() &&
+ NumElems == VT.getVectorNumElements()) {
+ SmallVector<SDValue, 16> ReverseElts(Elts.rbegin(), Elts.rend());
+ if (SDValue RevLd = EltsFromConsecutiveLoads(
+ VT, ReverseElts, DL, DAG, Subtarget, IsAfterLegalize, Depth + 1)) {
+ SmallVector<int, 16> ReverseMask(NumElems);
+ std::iota(ReverseMask.rbegin(), ReverseMask.rend(), 0);
+ return DAG.getVectorShuffle(VT, DL, RevLd, DAG.getUNDEF(VT), ReverseMask);
+ }
+ }
+
return SDValue();
}
@@ -7948,7 +8033,7 @@ static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL,
for (unsigned i = 0; i != NumElems; ++i) {
unsigned Opc = Op.getOperand(i).getOpcode();
- if (Opc == ISD::UNDEF)
+ if (Opc == ISD::POISON || Opc == ISD::UNDEF)
continue;
if (Opc != ISD::EXTRACT_VECTOR_ELT) {
@@ -7991,7 +8076,7 @@ static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL,
if (!VecIn1.getNode())
return SDValue();
- VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
+ VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getPOISON(VT);
SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
for (unsigned Idx : InsertIndices)
@@ -8115,6 +8200,8 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl,
case X86ISD::FHSUB:
case X86ISD::HADD:
case X86ISD::HSUB:
+ case X86ISD::HADDS:
+ case X86ISD::HSUBS:
return true;
}
return false;
@@ -8426,9 +8513,7 @@ static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
// DAGCombiner::visitFADDForFMACombine. It would be good to have one
// function that would answer if it is Ok to fuse MUL + ADD to FMADD
// or MUL + ADDSUB to FMADDSUB.
- const TargetOptions &Options = DAG.getTarget().Options;
bool AllowFusion =
- Options.AllowFPOpFusion == FPOpFusion::Fast ||
(AllowSubAddOrAddSubContract && Opnd0->getFlags().hasAllowContract());
if (!AllowFusion)
return false;
@@ -8856,6 +8941,56 @@ static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL,
return SDValue();
}
+/// Widen a BUILD_VECTOR if the scalar operands are freely mergeable.
+static SDValue widenBuildVector(BuildVectorSDNode *BVOp, SDLoc const &DL,
+ X86Subtarget const &Subtarget,
+ SelectionDAG &DAG) {
+ using namespace SDPatternMatch;
+ MVT VT = BVOp->getSimpleValueType(0);
+ MVT SVT = VT.getScalarType();
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned EltBits = SVT.getSizeInBits();
+
+ if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
+ return SDValue();
+
+ unsigned WideBits = 2 * EltBits;
+ MVT WideSVT = MVT::getIntegerVT(WideBits);
+ MVT WideVT = MVT::getVectorVT(WideSVT, NumElts / 2);
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(WideSVT))
+ return SDValue();
+
+ SmallVector<SDValue, 8> WideOps;
+ for (unsigned I = 0; I != NumElts; I += 2) {
+ SDValue Op0 = BVOp->getOperand(I + 0);
+ SDValue Op1 = BVOp->getOperand(I + 1);
+
+ if (Op0.isUndef() && Op1.isUndef()) {
+ WideOps.push_back(DAG.getUNDEF(WideSVT));
+ continue;
+ }
+
+ // TODO: Constant repacking?
+
+ // Merge scalars that have been split from the same source.
+ SDValue X, Y;
+ if (sd_match(Op0, m_Trunc(m_Value(X))) &&
+ sd_match(Op1, m_Trunc(m_Srl(m_Value(Y), m_SpecificInt(EltBits)))) &&
+ peekThroughTruncates(X) == peekThroughTruncates(Y) &&
+ X.getValueType().bitsGE(WideSVT)) {
+ if (X.getValueType().bitsGT(WideSVT))
+ X = DAG.getNode(ISD::TRUNCATE, DL, WideSVT, X);
+ WideOps.push_back(X);
+ continue;
+ }
+
+ return SDValue();
+ }
+
+ assert(WideOps.size() == (NumElts / 2) && "Failed to widen build vector");
+ return DAG.getBitcast(VT, DAG.getBuildVector(WideVT, DL, WideOps));
+}
+
/// Create a vector constant without a load. SSE/AVX provide the bare minimum
/// functionality to do this, so it's all zeros, all ones, or some derivation
/// that is cheap to calculate.
@@ -9326,6 +9461,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
return BitOp;
if (SDValue Blend = lowerBuildVectorAsBlend(BV, dl, Subtarget, DAG))
return Blend;
+ if (SDValue WideBV = widenBuildVector(BV, dl, Subtarget, DAG))
+ return WideBV;
unsigned NumZero = ZeroMask.popcount();
unsigned NumNonZero = NonZeroMask.popcount();
@@ -18370,16 +18507,20 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
SmallVector<int> Mask(OrigMask);
// Canonicalize the shuffle with any horizontal ops inputs.
+ // Don't attempt this if the shuffle can still be widened as we may lose
+ // whole lane shuffle patterns.
// NOTE: This may update Ops and Mask.
- if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
- Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
- return DAG.getBitcast(VT, HOp);
+ if (!canWidenShuffleElements(Mask)) {
+ if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
+ Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
+ return DAG.getBitcast(VT, HOp);
- V1 = DAG.getBitcast(VT, Ops[0]);
- V2 = DAG.getBitcast(VT, Ops[1]);
- assert(NumElements == (int)Mask.size() &&
- "canonicalizeShuffleMaskWithHorizOp "
- "shouldn't alter the shuffle mask size");
+ V1 = DAG.getBitcast(VT, Ops[0]);
+ V2 = DAG.getBitcast(VT, Ops[1]);
+ assert(NumElements == (int)Mask.size() &&
+ "canonicalizeShuffleMaskWithHorizOp "
+ "shouldn't alter the shuffle mask size");
+ }
// Canonicalize zeros/ones/fp splat constants to ensure no undefs.
// These will be materialized uniformly anyway, so make splat matching easier.
@@ -19142,6 +19283,72 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
return SDValue();
}
+static SDValue LowerFLDEXP(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ SDValue X = Op.getOperand(0);
+ MVT XTy = X.getSimpleValueType();
+ SDValue Exp = Op.getOperand(1);
+
+ switch (XTy.SimpleTy) {
+ default:
+ return SDValue();
+ case MVT::f16:
+ if (!Subtarget.hasFP16())
+ X = DAG.getFPExtendOrRound(X, DL, MVT::f32);
+ [[fallthrough]];
+ case MVT::f32:
+ case MVT::f64: {
+ MVT VT = MVT::getVectorVT(X.getSimpleValueType(),
+ 128 / X.getSimpleValueType().getSizeInBits());
+ Exp = DAG.getNode(ISD::SINT_TO_FP, DL, X.getValueType(), Exp);
+ SDValue VX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, X);
+ SDValue VExp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Exp);
+ SDValue Scalefs = DAG.getNode(X86ISD::SCALEFS, DL, VT, VX, VExp);
+ SDValue Final = DAG.getExtractVectorElt(DL, X.getValueType(), Scalefs, 0);
+ return DAG.getFPExtendOrRound(Final, DL, XTy);
+ }
+ case MVT::v4f32:
+ case MVT::v2f64:
+ case MVT::v8f32:
+ case MVT::v4f64:
+ case MVT::v16f32:
+ case MVT::v8f64:
+ if (XTy.getSizeInBits() == 512 || Subtarget.hasVLX()) {
+ Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp);
+ return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp);
+ }
+ break;
+ case MVT::v8f16:
+ case MVT::v16f16:
+ if (Subtarget.hasFP16()) {
+ if (Subtarget.hasVLX()) {
+ Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp);
+ return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp);
+ }
+ break;
+ }
+ X = DAG.getFPExtendOrRound(X, DL, XTy.changeVectorElementType(MVT::f32));
+ Exp = DAG.getSExtOrTrunc(Exp, DL,
+ X.getSimpleValueType().changeTypeToInteger());
+ break;
+ case MVT::v32f16:
+ if (Subtarget.hasFP16()) {
+ Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp);
+ return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp);
+ }
+ return splitVectorOp(Op, DAG, DL);
+ }
+ SDValue WideX = widenSubVector(X, true, Subtarget, DAG, DL, 512);
+ SDValue WideExp = widenSubVector(Exp, true, Subtarget, DAG, DL, 512);
+ WideExp = DAG.getNode(ISD::SINT_TO_FP, DL, WideX.getSimpleValueType(), WideExp);
+ SDValue Scalef =
+ DAG.getNode(X86ISD::SCALEF, DL, WideX.getValueType(), WideX, WideExp);
+ SDValue Final =
+ DAG.getExtractSubvector(DL, X.getSimpleValueType(), Scalef, 0);
+ return DAG.getFPExtendOrRound(Final, DL, XTy);
+}
+
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
@@ -22861,6 +23068,13 @@ static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
if (!OpVT.isScalarInteger() || OpSize < 128)
return SDValue();
+ // Don't do this if we're not supposed to use the FPU.
+ bool NoImplicitFloatOps =
+ DAG.getMachineFunction().getFunction().hasFnAttribute(
+ Attribute::NoImplicitFloat);
+ if (Subtarget.useSoftFloat() || NoImplicitFloatOps)
+ return SDValue();
+
// Ignore a comparison with zero because that gets special treatment in
// EmitTest(). But make an exception for the special case of a pair of
// logically-combined vector-sized operands compared to zero. This pattern may
@@ -22883,13 +23097,9 @@ static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
// Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
// Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
// Otherwise use PCMPEQ (plus AND) and mask testing.
- bool NoImplicitFloatOps =
- DAG.getMachineFunction().getFunction().hasFnAttribute(
- Attribute::NoImplicitFloat);
- if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
- ((OpSize == 128 && Subtarget.hasSSE2()) ||
- (OpSize == 256 && Subtarget.hasAVX()) ||
- (OpSize == 512 && Subtarget.useAVX512Regs()))) {
+ if ((OpSize == 128 && Subtarget.hasSSE2()) ||
+ (OpSize == 256 && Subtarget.hasAVX()) ||
+ (OpSize == 512 && Subtarget.useAVX512Regs())) {
bool HasPT = Subtarget.hasSSE41();
// PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
@@ -29565,9 +29775,9 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
}
if (!(IsLoLaneAllZeroOrUndef || IsHiLaneAllZeroOrUndef)) {
SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(0x00FF, dl, ExVT));
- SDValue BLo = DAG.getNode(ISD::AND, dl, VT, Mask, B);
SDValue BHi = DAG.getNode(X86ISD::ANDNP, dl, VT, Mask, B);
- SDValue RLo = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BLo);
+ SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, DAG.getBitcast(ExVT, A),
+ DAG.getBitcast(ExVT, B));
SDValue RHi = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BHi);
RLo = DAG.getNode(ISD::AND, dl, VT, DAG.getBitcast(VT, RLo), Mask);
RHi = DAG.getNode(X86ISD::VSHLI, dl, ExVT, RHi,
@@ -29583,26 +29793,8 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
SDValue Undef = DAG.getUNDEF(VT);
SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
-
- SDValue BLo, BHi;
- if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
- // If the RHS is a constant, manually unpackl/unpackh.
- SmallVector<SDValue, 16> LoOps, HiOps;
- for (unsigned i = 0; i != NumElts; i += 16) {
- for (unsigned j = 0; j != 8; ++j) {
- LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
- MVT::i16));
- HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
- MVT::i16));
- }
- }
-
- BLo = DAG.getBuildVector(ExVT, dl, LoOps);
- BHi = DAG.getBuildVector(ExVT, dl, HiOps);
- } else {
- BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
- BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
- }
+ SDValue BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
+ SDValue BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
// Multiply, mask the lower 8bits of the lo/hi results and pack.
SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
@@ -30905,6 +31097,63 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
}
+ if (VT == MVT::v64i8 && Subtarget.canExtendTo512BW()) {
+ // On AVX512BW, we can use variable 16-bit shifts to implement variable
+ // 8-bit shifts. For this, we split the input into two vectors, RLo and RHi.
+ // The i-th lane of RLo contains the (2*i)-th lane of R, and the i-th lane
+ // of RHi contains the (2*i+1)-th lane of R. After shifting, these vectors
+ // can efficiently be merged together using a masked move.
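+ // A rough worked example for the SHL case (names as in the code below): for
+ // an i16 lane holding bytes [Hi:Lo] with amounts [AHi:ALo], the low byte of
+ // ((Hi:Lo) << ALo) is the result for Lo and the high byte of
+ // ((Hi:00) << AHi) is the result for Hi; the final select keeps even bytes
+ // from ShiftedLo and odd bytes from ShiftedHi.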
+ MVT ExtVT = MVT::v32i16;
+
+ SDValue RLo, RHi;
+ // Isolate lower and upper lanes of Amt by masking odd lanes in AmtLo and
+ // right shifting AmtHi.
+ SDValue AmtLo = DAG.getNode(ISD::AND, dl, ExtVT, DAG.getBitcast(ExtVT, Amt),
+ DAG.getConstant(0x00ff, dl, ExtVT));
+ SDValue AmtHi = getTargetVShiftByConstNode(
+ X86ISD::VSRLI, dl, ExtVT, DAG.getBitcast(ExtVT, Amt), 8, DAG);
+ switch (Opc) {
+ case ISD::SHL:
+ // Because we shift left, no bits from the high half can influence the low
+ // half, so we don't need to mask RLo. We do however need to mask RHi, to
+ // prevent high bits of an even lane overflowing into low bits of an odd
+ // lane.
+ RLo = DAG.getBitcast(ExtVT, R);
+ RHi = DAG.getNode(ISD::AND, dl, ExtVT, RLo,
+ DAG.getConstant(0xff00, dl, ExtVT));
+ break;
+ case ISD::SRL:
+ // Same idea as above, but this time we need to make sure no low bits of
+ // an odd lane can overflow into high bits of an even lane.
+ RHi = DAG.getBitcast(ExtVT, R);
+ RLo = DAG.getNode(ISD::AND, dl, ExtVT, RHi,
+ DAG.getConstant(0x00ff, dl, ExtVT));
+ break;
+ case ISD::SRA:
+ // For arithmetic right shifts, we want to sign extend each even lane of R
+ // such that the upper half of the corresponding lane of RLo is 0 or -1
+ // depending on the sign bit of the original lane. We do this using 2
+ // immediate shifts.
+ RHi = DAG.getBitcast(ExtVT, R);
+ RLo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, RHi, 8, DAG);
+ RLo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExtVT, RLo, 8, DAG);
+ break;
+ default:
+ llvm_unreachable("Unexpected Shift Op");
+ }
+
+ SDValue ShiftedLo =
+ DAG.getBitcast(VT, DAG.getNode(Opc, dl, ExtVT, RLo, AmtLo));
+ SDValue ShiftedHi =
+ DAG.getBitcast(VT, DAG.getNode(Opc, dl, ExtVT, RHi, AmtHi));
+
+ // To merge the shifted vectors back together, we select even lanes
+ // from ShiftedLo and odd lanes from ShiftedHi.
+ SDValue SelectMask = DAG.getBitcast(
+ MVT::v64i1, DAG.getConstant(0x5555555555555555, dl, MVT::i64));
+ return DAG.getSelect(dl, VT, SelectMask, ShiftedLo, ShiftedHi);
+ }
+
if (VT == MVT::v16i8 ||
(VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
(VT == MVT::v64i8 && Subtarget.hasBWI())) {
@@ -31124,19 +31373,15 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
unsigned NumElts = VT.getVectorNumElements();
- if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
-
- if (IsCstSplat) {
- if (IsFSHR)
- std::swap(Op0, Op1);
- uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
- SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
- return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
- {Op0, Op1, Imm}, DAG, Subtarget);
- }
+ // For non-VLX VBMI2 targets, widen 128/256-bit to 512-bit so
+ // the rest of the lowering/isel can select the VBMI2 forms.
+ // Only Custom types (v8i16, v4i32, v2i64, v16i16, v8i32, v4i64) can
+ // reach LowerFunnelShift with VBMI2 but no VLX, so no type check needed.
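+ // e.g. (sketch) a v8i16 fshl is widened by getAVX512Node to v32i16 so the
+ // 512-bit VBMI2 patterns can match, and the low v8i16 subvector is then
+ // extracted from the result.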
+ if (Subtarget.hasVBMI2() && !Subtarget.hasVLX() && EltSizeInBits > 8) {
return getAVX512Node(IsFSHR ? ISD::FSHR : ISD::FSHL, DL, VT,
{Op0, Op1, Amt}, DAG, Subtarget);
}
+
assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
@@ -33001,60 +33246,6 @@ static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
-static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- SDValue Arg = Op.getOperand(0);
- EVT ArgVT = Arg.getValueType();
- bool isF64 = ArgVT == MVT::f64;
-
- RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
- const char *LibcallName = TLI.getLibcallName(LC);
- if (!LibcallName)
- return SDValue();
-
- assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
-
- // For MacOSX, we want to call an alternative entry point: __sincos_stret,
- // which returns the values as { float, float } (in XMM0) or
- // { double, double } (which is returned in XMM0, XMM1).
- SDLoc dl(Op);
- Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
-
- TargetLowering::ArgListTy Args;
- Args.emplace_back(Arg, ArgTy);
-
- // Only optimize x86_64 for now. i386 is a bit messy. For f32,
- // the small struct {f32, f32} is returned in (eax, edx). For f64,
- // the results are returned via SRet in memory.
- SDValue Callee =
- DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
-
- Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
- : (Type *)FixedVectorType::get(ArgTy, 4);
-
- TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl)
- .setChain(DAG.getEntryNode())
- .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
-
- std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
-
- if (isF64)
- // Returned in xmm0 and xmm1.
- return CallResult.first;
-
- // Returned in bits 0:31 and 32:64 xmm0.
- SDValue SinVal =
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
- DAG.getVectorIdxConstant(0, dl));
- SDValue CosVal =
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
- DAG.getVectorIdxConstant(1, dl));
- SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
- return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
-}
-
/// Widen a vector input to a vector of NVT. The
/// input vector must have the same element type as NVT.
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
@@ -33659,7 +33850,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::ABDS:
case ISD::ABDU: return LowerABD(Op, Subtarget, DAG);
case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
- case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
@@ -33669,7 +33859,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
case ISD::PREFETCH: return LowerPREFETCH(Op, Subtarget, DAG);
- // clang-format on
+ case ISD::FLDEXP: return LowerFLDEXP(Op, Subtarget, DAG);
+ // clang-format on
}
}
@@ -33753,6 +33944,59 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
}
return;
}
+ case ISD::CTLZ:
+ case ISD::CTTZ:
+ case ISD::CTLZ_ZERO_UNDEF:
+ case ISD::CTTZ_ZERO_UNDEF: {
+ // Fold i256/i512 CTLZ/CTTZ patterns to make use of AVX512
+ // vXi64 CTLZ/CTTZ and VECTOR_COMPRESS.
+ // Compute the CTLZ/CTTZ of each element, add the element's bit offset,
+ // compress the result to remove all zero elements (passthru is set to
+ // scalar bitwidth if all elements are zero) and extract the lowest
+ // compressed element.
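+ // A rough worked example (i256 CTTZ where only overall bit 141 is set):
+ // the v4i64 elements (lo->hi) [0, 0, 1<<13, 0] give per-element cttz
+ // [64, 64, 13, 64]; adding offsets [0, 64, 128, 192] yields
+ // [64, 128, 141, 256]; compressing by the non-zero mask [0, 0, 1, 0]
+ // moves 141 into element 0, which is extracted as the result.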
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ assert(Subtarget.hasCDI() && "AVX512CD required");
+ assert((VT == MVT::i256 || VT == MVT::i512) && "Unexpected VT!");
+ if (VT == MVT::i256 && !X86::mayFoldLoad(N0, Subtarget))
+ return;
+
+ unsigned SizeInBits = VT.getSizeInBits();
+ MVT VecVT = MVT::getVectorVT(MVT::i64, SizeInBits / 64);
+ MVT BoolVT = VecVT.changeVectorElementType(MVT::i1);
+ SDValue Vec = DAG.getBitcast(VecVT, N0);
+
+ SmallVector<int, 8> RevMask;
+ SmallVector<SDValue, 8> Offsets;
+ for (unsigned I = 0, E = VecVT.getVectorNumElements(); I != E; ++I) {
+ RevMask.push_back((int)((E - 1) - I));
+ Offsets.push_back(DAG.getConstant(I * 64, dl, MVT::i64));
+ }
+
+ // CTLZ - reverse the elements as we want the top non-zero element at the
+ // bottom for compression.
+ unsigned VecOpc = ISD::CTTZ;
+ if (Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF) {
+ VecOpc = ISD::CTLZ;
+ Vec = DAG.getVectorShuffle(VecVT, dl, Vec, Vec, RevMask);
+ }
+
+ SDValue PassThrough = DAG.getUNDEF(VecVT);
+ if (Opc == ISD::CTLZ || Opc == ISD::CTTZ)
+ PassThrough = DAG.getConstant(SizeInBits, dl, VecVT);
+
+ SDValue IsNonZero = DAG.getSetCC(dl, BoolVT, Vec,
+ DAG.getConstant(0, dl, VecVT), ISD::SETNE);
+ SDValue Cnt = DAG.getNode(VecOpc, dl, VecVT, Vec);
+ Cnt = DAG.getNode(ISD::ADD, dl, VecVT, Cnt,
+ DAG.getBuildVector(VecVT, dl, Offsets));
+ Cnt = DAG.getNode(ISD::VECTOR_COMPRESS, dl, VecVT, Cnt, IsNonZero,
+ PassThrough);
+ Cnt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cnt,
+ DAG.getVectorIdxConstant(0, dl));
+ Results.push_back(DAG.getZExtOrTrunc(Cnt, dl, VT));
+ return;
+ }
case ISD::MUL: {
EVT VT = N->getValueType(0);
assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
@@ -34928,6 +35172,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BLENDV)
NODE_NAME_CASE(HADD)
NODE_NAME_CASE(HSUB)
+ NODE_NAME_CASE(HADDS)
+ NODE_NAME_CASE(HSUBS)
NODE_NAME_CASE(FHADD)
NODE_NAME_CASE(FHSUB)
NODE_NAME_CASE(CONFLICT)
@@ -38165,22 +38411,22 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
default:
llvm_unreachable("Unexpected instruction!");
case X86::PTCVTROWD2PSrri:
- Opc = X86::TCVTROWD2PSrri;
+ Opc = X86::TCVTROWD2PSrti;
break;
case X86::PTCVTROWPS2BF16Hrri:
- Opc = X86::TCVTROWPS2BF16Hrri;
+ Opc = X86::TCVTROWPS2BF16Hrti;
break;
case X86::PTCVTROWPS2PHHrri:
- Opc = X86::TCVTROWPS2PHHrri;
+ Opc = X86::TCVTROWPS2PHHrti;
break;
case X86::PTCVTROWPS2BF16Lrri:
- Opc = X86::TCVTROWPS2BF16Lrri;
+ Opc = X86::TCVTROWPS2BF16Lrti;
break;
case X86::PTCVTROWPS2PHLrri:
- Opc = X86::TCVTROWPS2PHLrri;
+ Opc = X86::TCVTROWPS2PHLrti;
break;
case X86::PTILEMOVROWrri:
- Opc = X86::TILEMOVROWrri;
+ Opc = X86::TILEMOVROWrti;
break;
}
MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
@@ -38203,22 +38449,22 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
default:
llvm_unreachable("Unexpected instruction!");
case X86::PTCVTROWD2PSrre:
- Opc = X86::TCVTROWD2PSrre;
+ Opc = X86::TCVTROWD2PSrte;
break;
case X86::PTCVTROWPS2BF16Hrre:
- Opc = X86::TCVTROWPS2BF16Hrre;
+ Opc = X86::TCVTROWPS2BF16Hrte;
break;
case X86::PTCVTROWPS2BF16Lrre:
- Opc = X86::TCVTROWPS2BF16Lrre;
+ Opc = X86::TCVTROWPS2BF16Lrte;
break;
case X86::PTCVTROWPS2PHHrre:
- Opc = X86::TCVTROWPS2PHHrre;
+ Opc = X86::TCVTROWPS2PHHrte;
break;
case X86::PTCVTROWPS2PHLrre:
- Opc = X86::TCVTROWPS2PHLrre;
+ Opc = X86::TCVTROWPS2PHLrte;
break;
case X86::PTILEMOVROWrre:
- Opc = X86::TILEMOVROWrre;
+ Opc = X86::TILEMOVROWrte;
break;
}
MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
@@ -40704,8 +40950,9 @@ static SDValue canonicalizeShuffleMaskWithHorizOp(
}))
return SDValue();
- bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
- Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
+ bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::FHSUB ||
+ Opcode0 == X86ISD::HADD || Opcode0 == X86ISD::HSUB ||
+ Opcode0 == X86ISD::HADDS || Opcode0 == X86ISD::HSUBS);
bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
if (!isHoriz && !isPack)
return SDValue();
@@ -45011,11 +45258,16 @@ bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
case X86ISD::INSERTPS:
case X86ISD::BLENDI:
case X86ISD::PSHUFB:
+ case X86ISD::VZEXT_MOVL:
case X86ISD::PSHUFD:
+ case X86ISD::PSHUFHW:
+ case X86ISD::PSHUFLW:
+ case X86ISD::SHUFP:
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
case X86ISD::VPERMILPV:
case X86ISD::VPERMILPI:
+ case X86ISD::VPERMI:
case X86ISD::VPERMV:
case X86ISD::VPERMV3: {
SmallVector<int, 8> Mask;
@@ -45041,6 +45293,16 @@ bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
}
break;
}
+ case X86ISD::VBROADCAST: {
+ SDValue Src = Op.getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
+ if (SrcVT.isVector()) {
+ APInt DemandedSrc = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
+ return DAG.isGuaranteedNotToBeUndefOrPoison(Src, DemandedSrc, PoisonOnly,
+ Depth + 1);
+ }
+ return DAG.isGuaranteedNotToBeUndefOrPoison(Src, PoisonOnly, Depth + 1);
+ }
}
return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
Op, DemandedElts, DAG, PoisonOnly, Depth);
@@ -45085,13 +45347,19 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
// SSE target shuffles.
case X86ISD::INSERTPS:
case X86ISD::PSHUFB:
+ case X86ISD::VZEXT_MOVL:
case X86ISD::PSHUFD:
+ case X86ISD::PSHUFHW:
+ case X86ISD::PSHUFLW:
+ case X86ISD::SHUFP:
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
case X86ISD::VPERMILPV:
case X86ISD::VPERMILPI:
+ case X86ISD::VPERMI:
case X86ISD::VPERMV:
case X86ISD::VPERMV3:
+ case X86ISD::VBROADCAST:
return false;
// SSE comparisons handle all icmp/fcmp cases.
// TODO: Add CMPM/MM with test coverage.
@@ -53304,18 +53572,48 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
if (Mst->isCompressingStore())
return SDValue();
- EVT VT = Mst->getValue().getValueType();
+ if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
+ return ScalarStore;
+
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDLoc DL(N);
- if (Mst->isTruncatingStore())
- return SDValue();
+ SDValue Mask = Mst->getMask();
+ SDValue Value = Mst->getValue();
+ EVT MemVT = Mst->getMemoryVT();
+ EVT VT = Value.getValueType();
- if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
- return ScalarStore;
+ // See if the truncating store can be a saturating truncated store.
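+ // e.g. (sketch) a masked truncating store of smin(smax(X, -128), 127) from
+ // vXi16 to vXi8 can instead be emitted as a masked signed-saturating
+ // truncating store (X86ISD::VMTRUNCSTORES) of X.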
+ if (Mst->isTruncatingStore()) {
+ if (VT.isVector() && MemVT.isVector() && VT.getScalarType().isInteger() &&
+ MemVT.getScalarType().isInteger() &&
+ VT.getVectorNumElements() == MemVT.getVectorNumElements() &&
+ Subtarget.hasBWI() && Subtarget.hasVLX()) {
+
+ SDValue SatSrc;
+ unsigned Opc;
+ if (SDValue SVal = detectSSatPattern(Value, MemVT)) {
+ SatSrc = SVal;
+ Opc = X86ISD::VMTRUNCSTORES;
+ } else if (SDValue UVal = detectUSatPattern(Value, MemVT, DAG, DL)) {
+ SatSrc = UVal;
+ Opc = X86ISD::VMTRUNCSTOREUS;
+ } else {
+ return SDValue();
+ }
+
+ SDVTList VTs = DAG.getVTList(MVT::Other);
+ SDValue Ops[] = {Mst->getChain(), SatSrc, Mst->getBasePtr(), Mask};
+ MachineMemOperand *MMO = Mst->getMemOperand();
+ return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
+ }
+
+ // Otherwise don't combine if this store already truncates.
+ return SDValue();
+ }
// If the mask value has been legalized to a non-boolean vector, try to
// simplify ops leading up to it. We only demand the MSB of each lane.
- SDValue Mask = Mst->getMask();
if (Mask.getScalarValueSizeInBits() != 1) {
APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
@@ -53331,14 +53629,12 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
Mst->getAddressingMode());
}
- SDValue Value = Mst->getValue();
if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
- TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
- Mst->getMemoryVT())) {
- return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
- Mst->getBasePtr(), Mst->getOffset(), Mask,
- Mst->getMemoryVT(), Mst->getMemOperand(),
- Mst->getAddressingMode(), true);
+ TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(), MemVT)) {
+ return DAG.getMaskedStore(Mst->getChain(), DL, Value.getOperand(0),
+ Mst->getBasePtr(), Mst->getOffset(), Mask, MemVT,
+ Mst->getMemOperand(), Mst->getAddressingMode(),
+ true);
}
return SDValue();
@@ -53349,23 +53645,14 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
// i32 sub value.
static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
using namespace SDPatternMatch;
-
- // Only handle normal stores and its chain was a matching normal load.
- auto *Ld = dyn_cast<LoadSDNode>(St->getChain());
- if (!ISD::isNormalStore(St) || !St->isSimple() || !Ld ||
- !ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
- Ld->getBasePtr() != St->getBasePtr() ||
- Ld->getOffset() != St->getOffset())
- return SDValue();
-
- SDValue LoadVal(Ld, 0);
SDValue StoredVal = St->getValue();
EVT VT = StoredVal.getValueType();
- // Only narrow larger than legal scalar integers.
- if (!VT.isScalarInteger() ||
+ // Only narrow normal stores of larger than legal scalar integers.
+ if (!ISD::isNormalStore(St) || !St->isSimple() || !VT.isScalarInteger() ||
VT.getSizeInBits() <= (Subtarget.is64Bit() ? 64 : 32))
return SDValue();
@@ -53374,18 +53661,25 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
// BTC: X ^ (1 << ShAmt)
//
// BitInsert: (X & ~(1 << ShAmt)) | (InsertBit << ShAmt)
- SDValue InsertBit, ShAmt;
- if (!StoredVal.hasOneUse() ||
- !(sd_match(StoredVal, m_And(m_Specific(LoadVal),
+ SDValue SrcVal, InsertBit, ShAmt;
+ if (!(sd_match(StoredVal, m_And(m_Value(SrcVal),
m_Not(m_Shl(m_One(), m_Value(ShAmt))))) ||
sd_match(StoredVal,
- m_Or(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
+ m_Or(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
sd_match(StoredVal,
- m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
- sd_match(StoredVal,
- m_Or(m_And(m_Specific(LoadVal),
- m_Not(m_Shl(m_One(), m_Value(ShAmt)))),
- m_Shl(m_Value(InsertBit), m_Deferred(ShAmt))))))
+ m_Xor(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
+ sd_match(
+ StoredVal,
+ m_Or(m_And(m_Value(SrcVal), m_Not(m_Shl(m_One(), m_Value(ShAmt)))),
+ m_Shl(m_Value(InsertBit), m_Deferred(ShAmt))))))
+ return SDValue();
+
+ // SrcVal must be a matching normal load further up the chain.
+ auto *Ld = dyn_cast<LoadSDNode>(peekThroughBitcasts(SrcVal));
+ if (!Ld || !ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
+ Ld->getBasePtr() != St->getBasePtr() ||
+ Ld->getOffset() != St->getOffset() ||
+ !St->getChain().reachesChainWithoutSideEffects(SDValue(Ld, 1)))
return SDValue();
// Ensure the shift amount is in bounds.
@@ -53419,7 +53713,7 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
SDNodeFlags::NoUnsignedWrap);
// Reconstruct the BTC/BTR/BTS pattern for the i32 block and store.
- SDValue X = DAG.getNode(ISD::SRL, DL, VT, LoadVal, AlignAmt);
+ SDValue X = DAG.getNode(ISD::SRL, DL, VT, SrcVal, AlignAmt);
X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
SDValue Mask = DAG.getNode(ISD::SHL, DL, MVT::i32,
@@ -53439,8 +53733,21 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask);
}
- return DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(),
- Align(), St->getMemOperand()->getFlags());
+ SDValue NewStore =
+ DAG.getStore(St->getChain(), DL, Res, NewPtr,
+ MachinePointerInfo(St->getPointerInfo().getAddrSpace()),
+ Align(), St->getMemOperand()->getFlags());
+
+ // If there are other uses of StoredVal, replace with a new load of the
+ // whole (updated) value.
+ if (!StoredVal.hasOneUse()) {
+ SDValue NewLoad =
+ DAG.getLoad(VT, DL, NewStore, Ld->getBasePtr(), Ld->getMemOperand());
+ for (SDNode *User : StoredVal->users())
+ DCI.AddToWorklist(User);
+ DAG.ReplaceAllUsesWith(StoredVal, NewLoad);
+ }
+ return NewStore;
}
static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
@@ -53669,7 +53976,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
}
}
- if (SDValue R = narrowBitOpRMW(St, dl, DAG, Subtarget))
+ if (SDValue R = narrowBitOpRMW(St, dl, DAG, DCI, Subtarget))
return R;
// Convert store(cmov(load(p), x, CC), p) to cstore(x, p, CC)
@@ -54006,7 +54313,9 @@ static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
unsigned Opcode = N->getOpcode();
- bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
+ bool IsAdd =
+ (Opcode == ISD::FADD) || (Opcode == ISD::ADD) || (Opcode == ISD::SADDSAT);
+ bool IsSat = (Opcode == ISD::SADDSAT) || (Opcode == ISD::SSUBSAT);
SmallVector<int, 8> PostShuffleMask;
auto MergableHorizOp = [N](unsigned HorizOpcode) {
@@ -54036,11 +54345,17 @@ static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
break;
case ISD::ADD:
case ISD::SUB:
- if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
- VT == MVT::v16i16 || VT == MVT::v8i32)) {
+ case ISD::SADDSAT:
+ case ISD::SSUBSAT:
+ if (!Subtarget.hasSSSE3())
+ break;
+ if (VT == MVT::v8i16 || VT == MVT::v16i16 ||
+ (!IsSat && (VT == MVT::v4i32 || VT == MVT::v8i32))) {
+
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
- auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
+ auto HorizOpcode = IsSat ? (IsAdd ? X86ISD::HADDS : X86ISD::HSUBS)
+ : (IsAdd ? X86ISD::HADD : X86ISD::HSUB);
if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
PostShuffleMask, MergableHorizOp(HorizOpcode))) {
auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
@@ -54117,11 +54432,6 @@ static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
// FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
- return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
- Flags.hasAllowContract();
- };
-
auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
return DAG.getTarget().Options.NoSignedZerosFPMath ||
Flags.hasNoSignedZeros();
@@ -54134,7 +54444,7 @@ static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
};
if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
- !AllowContract(N->getFlags()))
+ !N->getFlags().hasAllowContract())
return SDValue();
EVT VT = N->getValueType(0);
@@ -54145,14 +54455,13 @@ static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
SDValue RHS = N->getOperand(1);
bool IsConj;
SDValue FAddOp1, MulOp0, MulOp1;
- auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
- &IsVectorAllNegativeZero,
+ auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &IsVectorAllNegativeZero,
&HasNoSignedZero](SDValue N) -> bool {
if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
return false;
SDValue Op0 = N.getOperand(0);
unsigned Opcode = Op0.getOpcode();
- if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
+ if (Op0.hasOneUse() && Op0->getFlags().hasAllowContract()) {
if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
MulOp0 = Op0.getOperand(0);
MulOp1 = Op0.getOperand(1);
@@ -54614,11 +54923,14 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
KnownBits KnownAmt = DAG.computeKnownBits(ShAmt);
// Check the shift amount is byte aligned.
// Check the truncation doesn't use any shifted in (zero) top bits.
- // Check the shift amount doesn't depend on the original load.
+ // Check the shift amount doesn't depend on the original load chain.
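+ // e.g. (illustrative) trunc i32 (srl (load i64 p), 32) can be folded to a
+ // narrower i32 load from p + 4 on this little-endian target.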
if (KnownAmt.countMinTrailingZeros() >= 3 &&
KnownAmt.getMaxValue().ule(SrcVT.getSizeInBits() -
VT.getSizeInBits()) &&
- !Ld->isPredecessorOf(ShAmt.getNode())) {
+ none_of(Ld->uses(), [&ShAmt](SDUse &Use) {
+ return Use.getResNo() == 1 &&
+ Use.getUser()->isPredecessorOf(ShAmt.getNode());
+ })) {
EVT PtrVT = Ld->getBasePtr().getValueType();
SDValue PtrBitOfs = DAG.getZExtOrTrunc(ShAmt, DL, PtrVT);
SDValue PtrByteOfs =
@@ -54627,10 +54939,10 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
SDValue NewPtr = DAG.getMemBasePlusOffset(
Ld->getBasePtr(), PtrByteOfs, DL, SDNodeFlags::NoUnsignedWrap);
SDValue NewLoad =
- DAG.getLoad(VT, DL, Ld->getChain(), NewPtr, Ld->getPointerInfo(),
+ DAG.getLoad(VT, DL, Ld->getChain(), NewPtr,
+ MachinePointerInfo(Ld->getPointerInfo().getAddrSpace()),
Align(), Ld->getMemOperand()->getFlags());
- DAG.ReplaceAllUsesOfValueWith(Src.getOperand(0).getValue(1),
- NewLoad.getValue(1));
+ DAG.makeEquivalentMemoryOrdering(Ld, NewLoad);
return NewLoad;
}
}
@@ -57400,6 +57712,40 @@ static SDValue combineFP_TO_xINT_SAT(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// Combiner: turn uniform-constant splat funnel shifts into VSHLD/VSHRD
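+// e.g. (illustrative) fshl v8i32 X, Y, splat(5) becomes X86ISD::VSHLD X, Y, 5;
+// for fshr the operands are swapped and X86ISD::VSHRD is used, with the splat
+// amount taken modulo the element size.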
+static SDValue combineFunnelShift(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ SDLoc DL(N);
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ SDValue Amt = N->getOperand(2);
+ EVT VT = Op0.getValueType();
+
+ if (!VT.isVector())
+ return SDValue();
+
+ // Only combine if the operation is legal for this type.
+ // This ensures we don't try to convert types that need to be
+ // widened/promoted.
+ if (!DAG.getTargetLoweringInfo().isOperationLegal(N->getOpcode(), VT))
+ return SDValue();
+
+ unsigned EltSize = VT.getScalarSizeInBits();
+ APInt ShiftVal;
+ if (!X86::isConstantSplat(Amt, ShiftVal))
+ return SDValue();
+
+ uint64_t ModAmt = ShiftVal.urem(EltSize);
+ SDValue Imm = DAG.getTargetConstant(ModAmt, DL, MVT::i8);
+ bool IsFSHR = N->getOpcode() == ISD::FSHR;
+
+ if (IsFSHR)
+ std::swap(Op0, Op1);
+ unsigned Opcode = IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD;
+ return DAG.getNode(Opcode, DL, VT, {Op0, Op1, Imm});
+}
+
static bool needCarryOrOverflowFlag(SDValue Flags) {
assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
@@ -59086,7 +59432,8 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
case X86ISD::ANDNP:
// TODO: AVX512 targets should only use CombineSubOperand like AVX1/2.
if (!IsSplat && (VT.is256BitVector() ||
- (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
+ (VT.is512BitVector() && Subtarget.useAVX512Regs()) ||
+ (EltSizeInBits == 1 && TLI.isTypeLegal(VT)))) {
// Don't concatenate root AVX1 NOT patterns.
// TODO: Allow NOT folding if Concat0 succeeds.
if (Opcode == ISD::XOR && Depth == 0 && !Subtarget.hasInt256() &&
@@ -59096,7 +59443,8 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
break;
SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
- if (Concat0 || Concat1 || Subtarget.useAVX512Regs())
+ if (Concat0 || Concat1 ||
+ (EltSizeInBits != 1 && Subtarget.useAVX512Regs()))
return DAG.getNode(Opcode, DL, VT,
Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
@@ -59156,6 +59504,31 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
}
}
break;
+ case ISD::SETCC:
+ if (!IsSplat && EltSizeInBits == 1 &&
+ llvm::all_of(Ops, [Op0](SDValue Op) {
+ return Op0.getOperand(0).getValueType() ==
+ Op.getOperand(0).getValueType() &&
+ Op0.getOperand(2) == Op.getOperand(2);
+ })) {
+ EVT SrcVT = Op0.getOperand(0).getValueType();
+ EVT NewSrcVT = EVT::getVectorVT(Ctx, SrcVT.getScalarType(),
+ NumOps * SrcVT.getVectorNumElements());
+ unsigned SrcSizeInBits = SrcVT.getScalarSizeInBits();
+ if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(NewSrcVT) &&
+ (NewSrcVT.is256BitVector() ||
+ (NewSrcVT.is512BitVector() && Subtarget.useAVX512Regs() &&
+ (SrcSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
+ SDValue LHS = CombineSubOperand(NewSrcVT.getSimpleVT(), Ops, 0);
+ SDValue RHS = CombineSubOperand(NewSrcVT.getSimpleVT(), Ops, 1);
+ if (LHS || RHS)
+ return DAG.getNode(Opcode, DL, VT,
+ LHS ? LHS : ConcatSubOperand(NewSrcVT, Ops, 0),
+ RHS ? RHS : ConcatSubOperand(NewSrcVT, Ops, 1),
+ Op0.getOperand(2));
+ }
+ }
+ break;
case ISD::CTPOP:
case ISD::CTTZ:
case ISD::CTLZ:
@@ -59219,6 +59592,36 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
ConcatSubOperand(VT, Ops, 1));
}
break;
+ case ISD::FSQRT:
+ case ISD::FCEIL:
+ case ISD::FTRUNC:
+ case ISD::FRINT:
+ case ISD::FNEARBYINT:
+ case ISD::FROUND:
+ case ISD::FROUNDEVEN:
+ case ISD::FFLOOR:
+ if (!IsSplat && (VT.is256BitVector() ||
+ (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
+ return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0));
+ }
+ break;
+ case X86ISD::FRCP:
+ case X86ISD::FRSQRT:
+ if (!IsSplat && VT.is256BitVector()) {
+ return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0));
+ }
+ break;
+ case X86ISD::VRNDSCALE:
+ if (!IsSplat &&
+ (VT.is256BitVector() ||
+ (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
+ llvm::all_of(Ops, [Op0](SDValue Op) {
+ return Op0.getOperand(1) == Op.getOperand(1);
+ })) {
+ return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
+ Op0.getOperand(1));
+ }
+ break;
case X86ISD::HADD:
case X86ISD::HSUB:
case X86ISD::FHADD:
@@ -59350,8 +59753,8 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
if (TLI->allowsMemoryAccess(Ctx, DAG.getDataLayout(), VT,
*FirstLd->getMemOperand(), &Fast) &&
Fast) {
- if (SDValue Ld =
- EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
+ if (SDValue Ld = EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget,
+ false, Depth + 1))
return Ld;
}
}
@@ -59490,6 +59893,17 @@ static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
}
}
+ // Attempt to merge comparison/logic ops if the type is legal.
+ if (TLI.isTypeLegal(VT) &&
+ (all_of(Ops, [](SDValue Op) { return Op.getOpcode() == ISD::SETCC; }) ||
+ all_of(Ops, [](SDValue Op) {
+ return ISD::isBitwiseLogicOp(Op.getOpcode());
+ }))) {
+ if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops,
+ DAG, Subtarget))
+ return R;
+ }
+
// Don't do anything else for i1 vectors.
return SDValue();
}
@@ -60830,6 +61244,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
case X86ISD::ADD:
case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI, Subtarget);
+ case ISD::SADDSAT:
+ case ISD::SSUBSAT: return combineToHorizontalAddSub(N, DAG, Subtarget);
case X86ISD::CLOAD:
case X86ISD::CSTORE: return combineX86CloadCstore(N, DAG);
case X86ISD::SBB: return combineSBB(N, DAG);
@@ -60953,6 +61369,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::VPERM2X128:
case X86ISD::SHUF128:
case X86ISD::VZEXT_MOVL:
+ case X86ISD::COMPRESS:
+ case X86ISD::EXPAND:
case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
case X86ISD::FMADD_RND:
case X86ISD::FMSUB:
@@ -61000,6 +61418,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::INTRINSIC_VOID: return combineINTRINSIC_VOID(N, DAG, DCI);
case ISD::FP_TO_SINT_SAT:
case ISD::FP_TO_UINT_SAT: return combineFP_TO_xINT_SAT(N, DAG, Subtarget);
+ case ISD::FSHL:
+ case ISD::FSHR: return combineFunnelShift(N, DAG, DCI, Subtarget);
// clang-format on
}
@@ -61554,8 +61974,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
(Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
- Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
- Op.getValueType());
+ Result = DAG.getSignedTargetConstant(C->getSExtValue(), SDLoc(Op),
+ Op.getValueType());
break;
}
}
@@ -61593,7 +62013,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
C->getSExtValue())) {
// Widen to 64 bits here to get it sign extended.
- Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
+ Result =
+ DAG.getSignedTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
break;
}
// FIXME gcc accepts some relocatable values here too, but only in certain
@@ -61642,9 +62063,11 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
BooleanContent BCont = getBooleanContents(MVT::i64);
ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
: ISD::SIGN_EXTEND;
- int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
- : CST->getSExtValue();
- Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
+ SDLoc DL(Op);
+ Result =
+ ExtOpc == ISD::ZERO_EXTEND
+ ? DAG.getTargetConstant(CST->getZExtValue(), DL, MVT::i64)
+ : DAG.getSignedTargetConstant(CST->getSExtValue(), DL, MVT::i64);
break;
}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index b7151f6..848fe4b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -270,6 +270,10 @@ namespace llvm {
HADD,
HSUB,
+ /// Integer horizontal saturating add/sub.
+ HADDS,
+ HSUBS,
+
/// Floating point horizontal add/sub.
FHADD,
FHSUB,
@@ -1478,7 +1482,7 @@ namespace llvm {
/// to a MemIntrinsicNode (touches memory). If this is the case, it returns
/// true and stores the intrinsic information into the IntrinsicInfo that was
/// passed to the function.
- bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
+ bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallBase &I,
MachineFunction &MF,
unsigned Intrinsic) const override;
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index a61bbe5..8db3e50 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -553,7 +553,7 @@ static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
static Constant* SegmentOffset(IRBuilderBase &IRB,
int Offset, unsigned AddressSpace) {
return ConstantExpr::getIntToPtr(
- ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
+ ConstantInt::getSigned(Type::getInt32Ty(IRB.getContext()), Offset),
IRB.getPtrTy(AddressSpace));
}
diff --git a/llvm/lib/Target/X86/X86InsertPrefetch.cpp b/llvm/lib/Target/X86/X86InsertPrefetch.cpp
deleted file mode 100644
index 953b755..0000000
--- a/llvm/lib/Target/X86/X86InsertPrefetch.cpp
+++ /dev/null
@@ -1,259 +0,0 @@
-//===------- X86InsertPrefetch.cpp - Insert cache prefetch hints ----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass applies cache prefetch instructions based on a profile. The pass
-// assumes DiscriminateMemOps ran immediately before, to ensure debug info
-// matches the one used at profile generation time. The profile is encoded in
-// afdo format (text or binary). It contains prefetch hints recommendations.
-// Each recommendation is made in terms of debug info locations, a type (i.e.
-// nta, t{0|1|2}) and a delta. The debug info identifies an instruction with a
-// memory operand (see X86DiscriminateMemOps). The prefetch will be made for
-// a location at that memory operand + the delta specified in the
-// recommendation.
-//
-//===----------------------------------------------------------------------===//
-
-#include "X86.h"
-#include "X86Subtarget.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/ProfileData/SampleProf.h"
-#include "llvm/ProfileData/SampleProfReader.h"
-#include "llvm/Support/VirtualFileSystem.h"
-#include "llvm/Transforms/IPO/SampleProfile.h"
-using namespace llvm;
-using namespace sampleprof;
-
-static cl::opt<std::string>
- PrefetchHintsFile("prefetch-hints-file",
- cl::desc("Path to the prefetch hints profile. See also "
- "-x86-discriminate-memops"),
- cl::Hidden);
-namespace {
-
-class X86InsertPrefetch : public MachineFunctionPass {
- void getAnalysisUsage(AnalysisUsage &AU) const override;
- bool doInitialization(Module &) override;
-
- bool runOnMachineFunction(MachineFunction &MF) override;
- struct PrefetchInfo {
- unsigned InstructionID;
- int64_t Delta;
- };
- typedef SmallVectorImpl<PrefetchInfo> Prefetches;
- bool findPrefetchInfo(const FunctionSamples *Samples, const MachineInstr &MI,
- Prefetches &prefetches) const;
-
-public:
- static char ID;
- X86InsertPrefetch(const std::string &PrefetchHintsFilename);
- StringRef getPassName() const override {
- return "X86 Insert Cache Prefetches";
- }
-
-private:
- std::string Filename;
- std::unique_ptr<SampleProfileReader> Reader;
-};
-
-using PrefetchHints = SampleRecord::CallTargetMap;
-
-// Return any prefetching hints for the specified MachineInstruction. The hints
-// are returned as pairs (name, delta).
-ErrorOr<const PrefetchHints &>
-getPrefetchHints(const FunctionSamples *TopSamples, const MachineInstr &MI) {
- if (const auto &Loc = MI.getDebugLoc())
- if (const auto *Samples = TopSamples->findFunctionSamples(Loc))
- return Samples->findCallTargetMapAt(FunctionSamples::getOffset(Loc),
- Loc->getBaseDiscriminator());
- return std::error_code();
-}
-
-// The prefetch instruction can't take memory operands involving vector
-// registers.
-bool IsMemOpCompatibleWithPrefetch(const MachineInstr &MI, int Op) {
- Register BaseReg = MI.getOperand(Op + X86::AddrBaseReg).getReg();
- Register IndexReg = MI.getOperand(Op + X86::AddrIndexReg).getReg();
- return (BaseReg == 0 ||
- X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg) ||
- X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg)) &&
- (IndexReg == 0 ||
- X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg) ||
- X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg));
-}
-
-} // end anonymous namespace
-
-//===----------------------------------------------------------------------===//
-// Implementation
-//===----------------------------------------------------------------------===//
-
-char X86InsertPrefetch::ID = 0;
-
-X86InsertPrefetch::X86InsertPrefetch(const std::string &PrefetchHintsFilename)
- : MachineFunctionPass(ID), Filename(PrefetchHintsFilename) {}
-
-/// Return true if the provided MachineInstruction has cache prefetch hints. In
-/// that case, the prefetch hints are stored, in order, in the Prefetches
-/// vector.
-bool X86InsertPrefetch::findPrefetchInfo(const FunctionSamples *TopSamples,
- const MachineInstr &MI,
- Prefetches &Prefetches) const {
- assert(Prefetches.empty() &&
- "Expected caller passed empty PrefetchInfo vector.");
-
- // There is no point to match prefetch hints if the profile is using MD5.
- if (FunctionSamples::UseMD5)
- return false;
-
- static constexpr std::pair<StringLiteral, unsigned> HintTypes[] = {
- {"_nta_", X86::PREFETCHNTA},
- {"_t0_", X86::PREFETCHT0},
- {"_t1_", X86::PREFETCHT1},
- {"_t2_", X86::PREFETCHT2},
- };
- static const char *SerializedPrefetchPrefix = "__prefetch";
-
- auto T = getPrefetchHints(TopSamples, MI);
- if (!T)
- return false;
- int16_t max_index = -1;
- // Convert serialized prefetch hints into PrefetchInfo objects, and populate
- // the Prefetches vector.
- for (const auto &S_V : *T) {
- StringRef Name = S_V.first.stringRef();
- if (Name.consume_front(SerializedPrefetchPrefix)) {
- int64_t D = static_cast<int64_t>(S_V.second);
- unsigned IID = 0;
- for (const auto &HintType : HintTypes) {
- if (Name.consume_front(HintType.first)) {
- IID = HintType.second;
- break;
- }
- }
- if (IID == 0)
- return false;
- uint8_t index = 0;
- Name.consumeInteger(10, index);
-
- if (index >= Prefetches.size())
- Prefetches.resize(index + 1);
- Prefetches[index] = {IID, D};
- max_index = std::max(max_index, static_cast<int16_t>(index));
- }
- }
- assert(max_index + 1 >= 0 &&
- "Possible overflow: max_index + 1 should be positive.");
- assert(static_cast<size_t>(max_index + 1) == Prefetches.size() &&
- "The number of prefetch hints received should match the number of "
- "PrefetchInfo objects returned");
- return !Prefetches.empty();
-}
-
-bool X86InsertPrefetch::doInitialization(Module &M) {
- if (Filename.empty())
- return false;
-
- LLVMContext &Ctx = M.getContext();
- // TODO: Propagate virtual file system into LLVM targets.
- auto FS = vfs::getRealFileSystem();
- ErrorOr<std::unique_ptr<SampleProfileReader>> ReaderOrErr =
- SampleProfileReader::create(Filename, Ctx, *FS);
- if (std::error_code EC = ReaderOrErr.getError()) {
- std::string Msg = "Could not open profile: " + EC.message();
- Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg,
- DiagnosticSeverity::DS_Warning));
- return false;
- }
- Reader = std::move(ReaderOrErr.get());
- Reader->read();
- return true;
-}
-
-void X86InsertPrefetch::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.setPreservesAll();
- MachineFunctionPass::getAnalysisUsage(AU);
-}
-
-bool X86InsertPrefetch::runOnMachineFunction(MachineFunction &MF) {
- if (!Reader)
- return false;
- const FunctionSamples *Samples = Reader->getSamplesFor(MF.getFunction());
- if (!Samples)
- return false;
-
- bool Changed = false;
-
- const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
- SmallVector<PrefetchInfo, 4> Prefetches;
- for (auto &MBB : MF) {
- for (auto MI = MBB.instr_begin(); MI != MBB.instr_end();) {
- auto Current = MI;
- ++MI;
-
- int Offset = X86II::getMemoryOperandNo(Current->getDesc().TSFlags);
- if (Offset < 0)
- continue;
- unsigned Bias = X86II::getOperandBias(Current->getDesc());
- int MemOpOffset = Offset + Bias;
- // FIXME(mtrofin): ORE message when the recommendation cannot be taken.
- if (!IsMemOpCompatibleWithPrefetch(*Current, MemOpOffset))
- continue;
- Prefetches.clear();
- if (!findPrefetchInfo(Samples, *Current, Prefetches))
- continue;
- assert(!Prefetches.empty() &&
- "The Prefetches vector should contain at least a value if "
- "findPrefetchInfo returned true.");
- for (auto &PrefInfo : Prefetches) {
- unsigned PFetchInstrID = PrefInfo.InstructionID;
- int64_t Delta = PrefInfo.Delta;
- const MCInstrDesc &Desc = TII->get(PFetchInstrID);
- MachineInstr *PFetch =
- MF.CreateMachineInstr(Desc, Current->getDebugLoc(), true);
- MachineInstrBuilder MIB(MF, PFetch);
-
- static_assert(X86::AddrBaseReg == 0 && X86::AddrScaleAmt == 1 &&
- X86::AddrIndexReg == 2 && X86::AddrDisp == 3 &&
- X86::AddrSegmentReg == 4,
- "Unexpected change in X86 operand offset order.");
-
- // This assumes X86::AddBaseReg = 0, {...}ScaleAmt = 1, etc.
- // FIXME(mtrofin): consider adding a:
- // MachineInstrBuilder::set(unsigned offset, op).
- MIB.addReg(Current->getOperand(MemOpOffset + X86::AddrBaseReg).getReg())
- .addImm(
- Current->getOperand(MemOpOffset + X86::AddrScaleAmt).getImm())
- .addReg(
- Current->getOperand(MemOpOffset + X86::AddrIndexReg).getReg())
- .addImm(Current->getOperand(MemOpOffset + X86::AddrDisp).getImm() +
- Delta)
- .addReg(Current->getOperand(MemOpOffset + X86::AddrSegmentReg)
- .getReg());
-
- if (!Current->memoperands_empty()) {
- MachineMemOperand *CurrentOp = *(Current->memoperands_begin());
- MIB.addMemOperand(MF.getMachineMemOperand(
- CurrentOp, CurrentOp->getOffset() + Delta, CurrentOp->getSize()));
- }
-
- // Insert before Current. This is because Current may clobber some of
- // the registers used to describe the input memory operand.
- MBB.insert(Current, PFetch);
- Changed = true;
- }
- }
- }
- return Changed;
-}
-
-FunctionPass *llvm::createX86InsertPrefetchPass() {
- return new X86InsertPrefetch(PrefetchHintsFile);
-}
diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td
index 522782a..6b8b8f7 100644
--- a/llvm/lib/Target/X86/X86InstrAMX.td
+++ b/llvm/lib/Target/X86/X86InstrAMX.td
@@ -370,11 +370,11 @@ let Predicates = [HasAMXMOVRS, In64BitMode], SchedRW = [WriteSystem] in {
multiclass m_tcvtrowd2ps {
let Predicates = [HasAMXAVX512, HasAVX10_2, In64BitMode] in {
let SchedRW = [WriteSystem] in {
- def rri : Ii8<0x7, MRMSrcReg, (outs VR512:$dst),
+ def rti : Ii8<0x7, MRMSrcReg, (outs VR512:$dst),
(ins TILE:$src1, i32u8imm:$src2),
"tcvtrowd2ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[]>, TA,XS, EVEX, EVEX_V512;
- def rre : I<0x4A, MRMSrcReg4VOp3, (outs VR512:$dst),
+ def rte : I<0x4A, MRMSrcReg4VOp3, (outs VR512:$dst),
(ins TILE:$src1, GR32:$src2),
"tcvtrowd2ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[]>, T8,XS, EVEX, VVVV, EVEX_V512;
@@ -450,12 +450,12 @@ multiclass AMXAVX512_BASE<bits<8> Opcode1, bits<8> Opcode2, string Opstr,
Prefix P1, Prefix P2> {
let Predicates = [HasAMXAVX512, HasAVX10_2, In64BitMode], SchedRW = [WriteSystem] in {
let OpPrefix = P1 in
- def rre : I<Opcode1, MRMSrcReg4VOp3, (outs VR512:$dst),
+ def rte : I<Opcode1, MRMSrcReg4VOp3, (outs VR512:$dst),
(ins TILE:$src1, GR32:$src2),
!strconcat(Opstr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[]>, EVEX, VVVV, EVEX_V512, T8;
let OpPrefix = P2 in
- def rri : Ii8<Opcode2, MRMSrcReg, (outs VR512:$dst),
+ def rti : Ii8<Opcode2, MRMSrcReg, (outs VR512:$dst),
(ins TILE:$src1, i32u8imm:$src2),
!strconcat(Opstr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[]>, EVEX, EVEX_V512, TA;
@@ -475,22 +475,22 @@ defm TCVTROWPS2PHL : AMXAVX512_BASE<0x6d, 0x77, "tcvtrowps2phl", PD, XD>;
defm TCVTROWPS2BF16H : AMXAVX512_BASE<0x6d, 0x07, "tcvtrowps2bf16h", XD, XD>;
defm TCVTROWPS2BF16L : AMXAVX512_BASE<0x6d, 0x77, "tcvtrowps2bf16l", XS, XS>;
-multiclass m_tilemovrow {
+multiclass AMXAVX512_TILEMOVE<bits<8> Opcode1, bits<8> Opcode2, string Opstr> {
let Predicates = [HasAMXAVX512, HasAVX10_2, In64BitMode] in {
let SchedRW = [WriteSystem] in {
- def rri : Ii8<0x7, MRMSrcReg, (outs VR512:$dst),
+ def rti : Ii8<Opcode1, MRMSrcReg, (outs VR512:$dst),
(ins TILE:$src1, u8imm:$src2),
- "tilemovrow\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, TA,PD, EVEX, EVEX_V512;
- def rre : I<0x4A, MRMSrcReg4VOp3, (outs VR512:$dst),
+ !strconcat(Opstr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, TA, PD, EVEX, EVEX_V512;
+ def rte : I<Opcode2, MRMSrcReg4VOp3, (outs VR512:$dst),
(ins TILE:$src1, GR32:$src2),
- "tilemovrow\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, T8,PD, EVEX, VVVV, EVEX_V512;
+ !strconcat(Opstr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, T8, PD, EVEX, VVVV, EVEX_V512;
}
} // HasAMXAVX512, HasAVX10_2, In64BitMode
}
-defm TILEMOVROW : m_tilemovrow;
+defm TILEMOVROW : AMXAVX512_TILEMOVE<0x07, 0x4A, "tilemovrow">;
let Predicates = [HasAMXAVX512, HasAVX10_2, In64BitMode] in {
let SchedRW = [WriteSystem] in {
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 1b748b7..e8fda82 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -300,6 +300,12 @@ def AVX512_512_SET0 : I<0, Pseudo, (outs VR512:$dst), (ins), "",
[(set VR512:$dst, (v16i32 immAllZerosV))]>;
def AVX512_512_SETALLONES : I<0, Pseudo, (outs VR512:$dst), (ins), "",
[(set VR512:$dst, (v16i32 immAllOnesV))]>;
+let AddedComplexity = 1, Predicates = [HasVLX] in {
+ def AVX512_128_SETALLONES : I<0, Pseudo, (outs VR128X:$dst), (ins),
+ "", [(set VR128X:$dst, (v4i32 immAllOnesV))]>;
+ def AVX512_256_SETALLONES : I<0, Pseudo, (outs VR256X:$dst), (ins),
+ "", [(set VR256X:$dst, (v8i32 immAllOnesV))]>;
+}
}
let Predicates = [HasAVX512] in {
@@ -3161,6 +3167,12 @@ multiclass avx512_mask_setop_w<SDPatternOperator Val> {
defm KSET0 : avx512_mask_setop_w<immAllZerosV>;
defm KSET1 : avx512_mask_setop_w<immAllOnesV>;
+// 8-bit mask set operations for AVX512DQ
+let Predicates = [HasDQI] in {
+ defm KSET0B : avx512_mask_setop<VK8, v8i1, immAllZerosV>;
+ defm KSET1B : avx512_mask_setop<VK8, v8i1, immAllOnesV>;
+}
+
// With AVX-512 only, 8-bit mask is promoted to 16-bit mask.
let Predicates = [HasAVX512] in {
def : Pat<(v8i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK8)>;
@@ -3173,6 +3185,34 @@ let Predicates = [HasAVX512] in {
def : Pat<(v1i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK1)>;
}
+// With AVX512DQ, use 8-bit operations for 8-bit masks to avoid setting upper
+// bits
+let Predicates = [HasDQI] in {
+ def : Pat<(v8i1 immAllZerosV), (KSET0B)>;
+ def : Pat<(v8i1 immAllOnesV), (KSET1B)>;
+}
+
+// Optimize bitconvert of all-ones constants to use kxnor instructions
+let Predicates = [HasDQI] in {
+ def : Pat<(v8i1(bitconvert(i8 255))), (KSET1B)>;
+ def : Pat<(v16i1(bitconvert(i16 255))), (COPY_TO_REGCLASS(KSET1B), VK16)>;
+}
+let Predicates = [HasBWI] in {
+ def : Pat<(v32i1(bitconvert(i32 -1))), (KSET1D)>;
+ def : Pat<(v64i1(bitconvert(i64 -1))), (KSET1Q)>;
+}
+// Submask patterns: lower N bits set in larger mask registers
+let Predicates = [HasBWI, HasDQI] in {
+ // v32i1 submasks
+ def : Pat<(v32i1(bitconvert(i32 255))), (COPY_TO_REGCLASS(KSET1B), VK32)>;
+ def : Pat<(v32i1(bitconvert(i32 65535))), (COPY_TO_REGCLASS(KSET1W), VK32)>;
+ // v64i1 submasks
+ def : Pat<(v64i1(bitconvert(i64 255))), (COPY_TO_REGCLASS(KSET1B), VK64)>;
+ def : Pat<(v64i1(bitconvert(i64 65535))), (COPY_TO_REGCLASS(KSET1W), VK64)>;
+ def : Pat<(v64i1(bitconvert(i64 4294967295))), (COPY_TO_REGCLASS(KSET1D),
+ VK64)>;
+}
+
// Patterns for kmask insert_subvector/extract_subvector to/from index=0
multiclass operation_subvector_mask_lowering<RegisterClass subRC, ValueType subVT,
RegisterClass RC, ValueType VT> {
diff --git a/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/llvm/lib/Target/X86/X86InstrCMovSetCC.td
index 7d5d7cf..b1599f2 100644
--- a/llvm/lib/Target/X86/X86InstrCMovSetCC.td
+++ b/llvm/lib/Target/X86/X86InstrCMovSetCC.td
@@ -150,7 +150,7 @@ let Uses = [EFLAGS], isCodeGenOnly = 1, ForceDisassemble = 1 in {
// SetZUCC and promoted SetCC instructions.
let Uses = [EFLAGS], isCodeGenOnly = 1, ForceDisassemble = 1,
- hasSideEffects = 0, Predicates = [In64BitMode], Predicates = [HasNDD] in {
+ hasSideEffects = 0, Predicates = [In64BitMode] in {
def SETZUCCr : I<0x40, MRMXrCC, (outs GR8:$dst), (ins ccode:$cond),
"setzu${cond}\t$dst", []>,
XD, ZU, NoCD8, Sched<[WriteSETCC]>;
@@ -167,6 +167,10 @@ let Uses = [EFLAGS], isCodeGenOnly = 1, ForceDisassemble = 1,
}
}
+let Predicates = [HasZU] in
+ def : Pat<(i32 (zext (X86setcc timm:$cond, EFLAGS))),
+ (INSERT_SUBREG (i32 (IMPLICIT_DEF)), (SETZUCCr ccode:$cond), sub_8bit)>;
+
// SALC is an undocumented instruction. Information for this instruction can be found
// here http://www.rcollins.org/secrets/opcodes/SALC.html
// Set AL if carry.
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index 5321ecf..0803a49 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -71,6 +71,8 @@ def X86fhadd : SDNode<"X86ISD::FHADD", SDTFPBinOp>;
def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>;
def X86hadd : SDNode<"X86ISD::HADD", SDTIntBinOp>;
def X86hsub : SDNode<"X86ISD::HSUB", SDTIntBinOp>;
+def X86hadds : SDNode<"X86ISD::HADDS", SDTIntBinOp>;
+def X86hsubs : SDNode<"X86ISD::HSUBS", SDTIntBinOp>;
def X86comi : SDNode<"X86ISD::COMI", SDTX86FCmp>;
def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86FCmp>;
def X86comi512 : SDNode<"X86ISD::COMX", SDTX86FCmp>;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 6b2a7a4..ebed733 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -85,7 +85,7 @@ static cl::opt<unsigned> UndefRegClearance(
void X86InstrInfo::anchor() {}
X86InstrInfo::X86InstrInfo(const X86Subtarget &STI)
- : X86GenInstrInfo(STI,
+ : X86GenInstrInfo(STI, RI,
(STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64
: X86::ADJCALLSTACKDOWN32),
(STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64
@@ -93,10 +93,9 @@ X86InstrInfo::X86InstrInfo(const X86Subtarget &STI)
X86::CATCHRET, (STI.is64Bit() ? X86::RET64 : X86::RET32)),
Subtarget(STI), RI(STI.getTargetTriple()) {}
-const TargetRegisterClass *
-X86InstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum,
- const TargetRegisterInfo *TRI) const {
- auto *RC = TargetInstrInfo::getRegClass(MCID, OpNum, TRI);
+const TargetRegisterClass *X86InstrInfo::getRegClass(const MCInstrDesc &MCID,
+ unsigned OpNum) const {
+ auto *RC = TargetInstrInfo::getRegClass(MCID, OpNum);
  // If the target does not have egpr, then r16-r31 will be reserved for all
// instructions.
if (!RC || !Subtarget.hasEGPR())
@@ -779,6 +778,8 @@ bool X86InstrInfo::isReMaterializableImpl(
case X86::AVX512_128_SET0:
case X86::AVX512_256_SET0:
case X86::AVX512_512_SET0:
+ case X86::AVX512_128_SETALLONES:
+ case X86::AVX512_256_SETALLONES:
case X86::AVX512_512_SETALLONES:
case X86::AVX512_FsFLD0SD:
case X86::AVX512_FsFLD0SH:
@@ -789,9 +790,11 @@ bool X86InstrInfo::isReMaterializableImpl(
case X86::FsFLD0SS:
case X86::FsFLD0SH:
case X86::FsFLD0F128:
+ case X86::KSET0B:
case X86::KSET0D:
case X86::KSET0Q:
case X86::KSET0W:
+ case X86::KSET1B:
case X86::KSET1D:
case X86::KSET1Q:
case X86::KSET1W:
@@ -958,8 +961,7 @@ bool X86InstrInfo::isReMaterializableImpl(
void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
Register DestReg, unsigned SubIdx,
- const MachineInstr &Orig,
- const TargetRegisterInfo &TRI) const {
+ const MachineInstr &Orig) const {
bool ClobbersEFLAGS = Orig.modifiesRegister(X86::EFLAGS, &TRI);
if (ClobbersEFLAGS && MBB.computeRegisterLiveness(&TRI, X86::EFLAGS, I) !=
MachineBasicBlock::LQR_Dead) {
@@ -4294,10 +4296,11 @@ static unsigned CopyToFromAsymmetricReg(Register DestReg, Register SrcReg,
if (X86::VR128XRegClass.contains(DestReg) &&
X86::GR32RegClass.contains(SrcReg))
- // Copy from a VR128 register to a VR128 register.
+ // Copy from a GR32 register to a VR128 register.
return HasAVX512 ? X86::VMOVDI2PDIZrr
: HasAVX ? X86::VMOVDI2PDIrr
: X86::MOVDI2PDIrr;
+
return 0;
}
@@ -4366,6 +4369,7 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
else if (X86::VK16RegClass.contains(DestReg, SrcReg))
Opc = Subtarget.hasBWI() ? (HasEGPR ? X86::KMOVQkk_EVEX : X86::KMOVQkk)
: (HasEGPR ? X86::KMOVQkk_EVEX : X86::KMOVWkk);
+
if (!Opc)
Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget);
@@ -4782,14 +4786,14 @@ void X86InstrInfo::loadStoreTileReg(MachineBasicBlock &MBB,
void X86InstrInfo::storeRegToStackSlot(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
bool isKill, int FrameIdx, const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI, Register VReg,
- MachineInstr::MIFlag Flags) const {
+
+ Register VReg, MachineInstr::MIFlag Flags) const {
const MachineFunction &MF = *MBB.getParent();
const MachineFrameInfo &MFI = MF.getFrameInfo();
- assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) &&
+ assert(MFI.getObjectSize(FrameIdx) >= RI.getSpillSize(*RC) &&
"Stack slot too small for store");
- unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
+ unsigned Alignment = std::max<uint32_t>(RI.getSpillSize(*RC), 16);
bool isAligned =
(Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
(RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx));
@@ -4803,15 +4807,17 @@ void X86InstrInfo::storeRegToStackSlot(
.setMIFlag(Flags);
}
-void X86InstrInfo::loadRegFromStackSlot(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg,
- int FrameIdx, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI,
- Register VReg, MachineInstr::MIFlag Flags) const {
+void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ Register DestReg, int FrameIdx,
+ const TargetRegisterClass *RC,
+ Register VReg,
+ MachineInstr::MIFlag Flags) const {
const MachineFunction &MF = *MBB.getParent();
const MachineFrameInfo &MFI = MF.getFrameInfo();
- assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) &&
+ assert(MFI.getObjectSize(FrameIdx) >= RI.getSpillSize(*RC) &&
"Load size exceeds stack slot");
- unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
+ unsigned Alignment = std::max<uint32_t>(RI.getSpillSize(*RC), 16);
bool isAligned =
(Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
(RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx));
@@ -5553,7 +5559,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
return false;
ShouldUpdateCC = true;
} else if (ImmDelta != 0) {
- unsigned BitWidth = TRI->getRegSizeInBits(*MRI->getRegClass(SrcReg));
+ unsigned BitWidth = RI.getRegSizeInBits(*MRI->getRegClass(SrcReg));
// Shift amount for min/max constants to adjust for 8/16/32 instruction
// sizes.
switch (OldCC) {
@@ -6244,9 +6250,31 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xf);
return true;
}
+ case X86::AVX512_128_SETALLONES:
+ case X86::AVX512_256_SETALLONES:
case X86::AVX512_512_SETALLONES: {
Register Reg = MIB.getReg(0);
- MIB->setDesc(get(X86::VPTERNLOGDZrri));
+ unsigned Opc;
+ switch (MI.getOpcode()) {
+ case X86::AVX512_128_SETALLONES: {
+ if (X86::VR128RegClass.contains(Reg))
+ return Expand2AddrUndef(MIB, get(X86::VPCMPEQDrr));
+
+ Opc = X86::VPTERNLOGDZ128rri;
+ break;
+ }
+ case X86::AVX512_256_SETALLONES: {
+ if (X86::VR256RegClass.contains(Reg))
+ return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
+
+ Opc = X86::VPTERNLOGDZ256rri;
+ break;
+ }
+ case X86::AVX512_512_SETALLONES:
+ Opc = X86::VPTERNLOGDZrri;
+ break;
+ }
+ MIB->setDesc(get(Opc));
// VPTERNLOGD needs 3 register inputs and an immediate.
// 0xff will return 1s for any input.
MIB.addReg(Reg, RegState::Undef)
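
The 0xff immediate works because VPTERNLOGD is a per-bit three-input lookup table: each result bit indexes the immediate with the corresponding bits of the three sources, so an all-ones table returns 1 for every combination and undef inputs are harmless. A scalar model of that semantics (sketch, not LLVM code):

#include <stdint.h>

// Per-bit model of vpternlogd: result bit i selects bit ((a<<2)|(b<<1)|c) of
// the 8-bit immediate, so imm == 0xFF yields all ones for any inputs.
static inline uint32_t ternlog32(uint32_t a, uint32_t b, uint32_t c, uint8_t imm) {
  uint32_t r = 0;
  for (int i = 0; i < 32; ++i) {
    unsigned idx = (((a >> i) & 1u) << 2) | (((b >> i) & 1u) << 1) | ((c >> i) & 1u);
    r |= (uint32_t)((imm >> idx) & 1u) << i;
  }
  return r;
}
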
@@ -6352,12 +6380,16 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
// registers, since it is not usable as a write mask.
// FIXME: A more advanced approach would be to choose the best input mask
// register based on context.
+ case X86::KSET0B:
+ return Expand2AddrKreg(MIB, get(X86::KXORBkk), X86::K0);
case X86::KSET0W:
return Expand2AddrKreg(MIB, get(X86::KXORWkk), X86::K0);
case X86::KSET0D:
return Expand2AddrKreg(MIB, get(X86::KXORDkk), X86::K0);
case X86::KSET0Q:
return Expand2AddrKreg(MIB, get(X86::KXORQkk), X86::K0);
+ case X86::KSET1B:
+ return Expand2AddrKreg(MIB, get(X86::KXNORBkk), X86::K0);
case X86::KSET1W:
return Expand2AddrKreg(MIB, get(X86::KXNORWkk), X86::K0);
case X86::KSET1D:
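
The KSET0*/KSET1* pseudos expand to kxor/kxnor of a mask register with itself; x ^ x == 0 and ~(x ^ x) == all-ones hold for any value, which is why K0 can be read even when its contents are undefined. A tiny scalar illustration (sketch only):

#include <assert.h>
#include <stdint.h>

// Mirrors how KSET0B/KSET1B expand to kxorb/kxnorb with an undef read of K0:
// the result is independent of the register's prior contents.
void kset_identities(uint8_t k0) {
  assert((uint8_t)(k0 ^ k0) == 0x00);
  assert((uint8_t)~(k0 ^ k0) == 0xFF);
}
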
@@ -7235,7 +7267,6 @@ static void updateOperandRegConstraints(MachineFunction &MF,
MachineInstr &NewMI,
const TargetInstrInfo &TII) {
MachineRegisterInfo &MRI = MF.getRegInfo();
- const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
for (int Idx : llvm::seq<int>(0, NewMI.getNumOperands())) {
MachineOperand &MO = NewMI.getOperand(Idx);
@@ -7247,7 +7278,7 @@ static void updateOperandRegConstraints(MachineFunction &MF,
continue;
auto *NewRC =
- MRI.constrainRegClass(Reg, TII.getRegClass(NewMI.getDesc(), Idx, &TRI));
+ MRI.constrainRegClass(Reg, TII.getRegClass(NewMI.getDesc(), Idx));
if (!NewRC) {
LLVM_DEBUG(
dbgs() << "WARNING: Unable to update register constraint for operand "
@@ -7345,7 +7376,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
unsigned SrcIdx = (Imm >> 6) & 3;
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
- const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI);
+ const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum);
unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
if ((Size == 0 || Size >= 16) && RCSize >= 16 &&
(MI.getOpcode() != X86::INSERTPSrri || Alignment >= Align(4))) {
@@ -7370,7 +7401,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
  // TODO: In most cases AVX doesn't have an 8-byte alignment requirement.
if (OpNum == 2) {
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
- const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI);
+ const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum);
unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment >= Align(8)) {
unsigned NewOpCode =
@@ -7389,7 +7420,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
// table twice.
if (OpNum == 2) {
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
- const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI);
+ const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum);
unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment < Align(16)) {
MachineInstr *NewMI =
@@ -7524,7 +7555,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
bool NarrowToMOV32rm = false;
if (Size) {
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
- const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI);
+ const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum);
unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
// Check if it's safe to fold the load. If the size of the object is
// narrower than the load width, then it's not.
@@ -8118,9 +8149,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
RC == &X86::VK32WMRegClass || RC == &X86::VK64WMRegClass;
};
- if (Op1.isReg() && IsVKWMClass(getRegClass(MCID, 1, &RI)))
+ if (Op1.isReg() && IsVKWMClass(getRegClass(MCID, 1)))
MaskReg = Op1.getReg();
- else if (Op2.isReg() && IsVKWMClass(getRegClass(MCID, 2, &RI)))
+ else if (Op2.isReg() && IsVKWMClass(getRegClass(MCID, 2)))
MaskReg = Op2.getReg();
if (MaskReg) {
@@ -8185,6 +8216,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
case X86::AVX1_SETALLONES:
case X86::AVX_SET0:
case X86::AVX512_256_SET0:
+ case X86::AVX512_256_SETALLONES:
Alignment = Align(32);
break;
case X86::V_SET0:
@@ -8192,6 +8224,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
case X86::AVX512_128_SET0:
case X86::FsFLD0F128:
case X86::AVX512_FsFLD0F128:
+ case X86::AVX512_128_SETALLONES:
Alignment = Align(16);
break;
case X86::MMX_SET0:
@@ -8250,6 +8283,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
case X86::AVX512_128_SET0:
case X86::AVX512_256_SET0:
case X86::AVX512_512_SET0:
+ case X86::AVX512_128_SETALLONES:
+ case X86::AVX512_256_SETALLONES:
case X86::AVX512_512_SETALLONES:
case X86::FsFLD0SH:
case X86::AVX512_FsFLD0SH:
@@ -8310,6 +8345,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
break;
case X86::AVX1_SETALLONES:
case X86::AVX2_SETALLONES:
+ case X86::AVX512_256_SETALLONES:
IsAllOnes = true;
[[fallthrough]];
case X86::AVX512_256_SET0:
@@ -8323,6 +8359,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
2);
break;
case X86::V_SETALLONES:
+ case X86::AVX512_128_SETALLONES:
IsAllOnes = true;
[[fallthrough]];
case X86::V_SET0:
@@ -8524,7 +8561,7 @@ bool X86InstrInfo::unfoldMemoryOperand(
const MCInstrDesc &MCID = get(Opc);
- const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI);
+ const TargetRegisterClass *RC = getRegClass(MCID, Index);
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
// TODO: Check if 32-byte or greater accesses are slow too?
if (!MI.hasOneMemOperand() && RC == &X86::VR128RegClass &&
@@ -8635,7 +8672,7 @@ bool X86InstrInfo::unfoldMemoryOperand(
// Emit the store instruction.
if (UnfoldStore) {
- const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI);
+ const TargetRegisterClass *DstRC = getRegClass(MCID, 0);
auto MMOs = extractStoreMMOs(MI.memoperands(), MF);
unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*DstRC), 16);
bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
@@ -8667,7 +8704,7 @@ bool X86InstrInfo::unfoldMemoryOperand(
const MCInstrDesc &MCID = get(Opc);
MachineFunction &MF = DAG.getMachineFunction();
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
- const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI);
+ const TargetRegisterClass *RC = getRegClass(MCID, Index);
unsigned NumDefs = MCID.NumDefs;
std::vector<SDValue> AddrOps;
std::vector<SDValue> BeforeOps;
@@ -8718,7 +8755,7 @@ bool X86InstrInfo::unfoldMemoryOperand(
std::vector<EVT> VTs;
const TargetRegisterClass *DstRC = nullptr;
if (MCID.getNumDefs() > 0) {
- DstRC = getRegClass(MCID, 0, &RI);
+ DstRC = getRegClass(MCID, 0);
VTs.push_back(*TRI.legalclasstypes_begin(*DstRC));
}
for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index 5f75559..a547fcd 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -246,9 +246,8 @@ public:
/// GR*RegClass (definition in TD file)
/// ->
/// GR*_NOREX2RegClass (Returned register class)
- const TargetRegisterClass *
- getRegClass(const MCInstrDesc &MCID, unsigned OpNum,
- const TargetRegisterInfo *TRI) const override;
+ const TargetRegisterClass *getRegClass(const MCInstrDesc &MCID,
+ unsigned OpNum) const override;
/// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
/// such, whenever a client has an instance of instruction info, it should
@@ -343,8 +342,7 @@ public:
bool isReMaterializableImpl(const MachineInstr &MI) const override;
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
Register DestReg, unsigned SubIdx,
- const MachineInstr &Orig,
- const TargetRegisterInfo &TRI) const override;
+ const MachineInstr &Orig) const override;
/// Given an operand within a MachineInstr, insert preceding code to put it
/// into the right format for a particular kind of LEA instruction. This may
@@ -469,14 +467,14 @@ public:
bool RenamableSrc = false) const override;
void storeRegToStackSlot(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
- bool isKill, int FrameIndex, const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI, Register VReg,
+ bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
void loadRegFromStackSlot(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg,
int FrameIndex, const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI, Register VReg,
+
+ Register VReg,
MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
void loadStoreTileReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 806b02b9..e4aaa1e 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -4864,12 +4864,12 @@ let isCommutable = 0 in {
defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd",
int_x86_ssse3_psign_d_128,
SchedWriteVecALU.XMM, load, 0>, VEX, VVVV, WIG;
- defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw",
- int_x86_ssse3_phadd_sw_128,
- SchedWritePHAdd.XMM, load, 0>, VEX, VVVV, WIG;
- defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw",
- int_x86_ssse3_phsub_sw_128,
- SchedWritePHAdd.XMM, load, 0>, VEX, VVVV, WIG;
+ defm VPHADDSW : SS3I_binop_rm<0x03, "vphaddsw", X86hadds, v8i16, v8i16, VR128,
+ load, i128mem,
+ SchedWritePHAdd.XMM, 0>, VEX, VVVV, WIG;
+ defm VPHSUBSW : SS3I_binop_rm<0x07, "vphsubsw", X86hsubs, v8i16, v8i16, VR128,
+ load, i128mem,
+ SchedWritePHAdd.XMM, 0>, VEX, VVVV, WIG;
}
}
@@ -4907,12 +4907,12 @@ let isCommutable = 0 in {
SchedWriteVecALU.YMM>, VEX, VVVV, VEX_L, WIG;
defm VPSIGND : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
SchedWriteVecALU.YMM>, VEX, VVVV, VEX_L, WIG;
- defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw",
- int_x86_avx2_phadd_sw,
- SchedWritePHAdd.YMM>, VEX, VVVV, VEX_L, WIG;
- defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw",
- int_x86_avx2_phsub_sw,
- SchedWritePHAdd.YMM>, VEX, VVVV, VEX_L, WIG;
+ defm VPHADDSWY : SS3I_binop_rm<0x03, "vphaddsw", X86hadds, v16i16, v16i16,
+ VR256, load, i256mem,
+ SchedWritePHAdd.YMM, 0>, VEX, VVVV, VEX_L, WIG;
+ defm VPHSUBSWY : SS3I_binop_rm<0x07, "vphsubsw", X86hsubs, v16i16, v16i16,
+ VR256, load, i256mem,
+ SchedWritePHAdd.YMM, 0>, VEX, VVVV, VEX_L, WIG;
}
}
@@ -4935,12 +4935,10 @@ let isCommutable = 0 in {
SchedWriteVecALU.XMM, memop>;
defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
memop, i128mem, SchedWriteVarShuffle.XMM>;
- defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw",
- int_x86_ssse3_phadd_sw_128,
- SchedWritePHAdd.XMM, memop>;
- defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw",
- int_x86_ssse3_phsub_sw_128,
- SchedWritePHAdd.XMM, memop>;
+ defm PHADDSW : SS3I_binop_rm<0x03, "phaddsw", X86hadds, v8i16, v8i16, VR128,
+ memop, i128mem, SchedWritePHAdd.XMM>;
+ defm PHSUBSW : SS3I_binop_rm<0x07, "phsubsw", X86hsubs, v8i16, v8i16, VR128,
+ memop, i128mem, SchedWritePHAdd.XMM>;
defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
v16i8, VR128, memop, i128mem,
SchedWriteVecIMul.XMM>;
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index 0f725a8..88ade87 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -534,7 +534,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx10_mask_vcvttpd2qqs_round_512, INTR_TYPE_1OP_MASK,
X86ISD::CVTTP2SIS, X86ISD::CVTTP2SIS_SAE),
X86_INTRINSIC_DATA(avx10_mask_vcvttpd2udqs_128, CVTPD2DQ_MASK,
- X86ISD::CVTTP2UIS, X86ISD::MCVTTP2SIS),
+ X86ISD::CVTTP2UIS, X86ISD::MCVTTP2UIS),
X86_INTRINSIC_DATA(avx10_mask_vcvttpd2udqs_256, INTR_TYPE_1OP_MASK,
X86ISD::CVTTP2UIS, 0),
X86_INTRINSIC_DATA(avx10_mask_vcvttpd2udqs_round_512, INTR_TYPE_1OP_MASK,
@@ -724,8 +724,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx2_permd, VPERM_2OP, X86ISD::VPERMV, 0),
X86_INTRINSIC_DATA(avx2_permps, VPERM_2OP, X86ISD::VPERMV, 0),
X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0),
+ X86_INTRINSIC_DATA(avx2_phadd_sw, INTR_TYPE_2OP, X86ISD::HADDS, 0),
X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0),
X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0),
+ X86_INTRINSIC_DATA(avx2_phsub_sw, INTR_TYPE_2OP, X86ISD::HSUBS, 0),
X86_INTRINSIC_DATA(avx2_phsub_w, INTR_TYPE_2OP, X86ISD::HSUB, 0),
X86_INTRINSIC_DATA(avx2_pmadd_ub_sw, INTR_TYPE_2OP, X86ISD::VPMADDUBSW, 0),
X86_INTRINSIC_DATA(avx2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0),
@@ -2017,11 +2019,13 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(ssse3_phadd_d, INTR_TYPE_CAST_MMX, 0, 0),
X86_INTRINSIC_DATA(ssse3_phadd_d_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
X86_INTRINSIC_DATA(ssse3_phadd_sw, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(ssse3_phadd_sw_128, INTR_TYPE_2OP, X86ISD::HADDS, 0),
X86_INTRINSIC_DATA(ssse3_phadd_w, INTR_TYPE_CAST_MMX, 0, 0),
X86_INTRINSIC_DATA(ssse3_phadd_w_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
X86_INTRINSIC_DATA(ssse3_phsub_d, INTR_TYPE_CAST_MMX, 0, 0),
X86_INTRINSIC_DATA(ssse3_phsub_d_128, INTR_TYPE_2OP, X86ISD::HSUB, 0),
X86_INTRINSIC_DATA(ssse3_phsub_sw, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(ssse3_phsub_sw_128, INTR_TYPE_2OP, X86ISD::HSUBS, 0),
X86_INTRINSIC_DATA(ssse3_phsub_w, INTR_TYPE_CAST_MMX, 0, 0),
X86_INTRINSIC_DATA(ssse3_phsub_w_128, INTR_TYPE_2OP, X86ISD::HSUB, 0),
X86_INTRINSIC_DATA(ssse3_pmadd_ub_sw, INTR_TYPE_CAST_MMX, 0, 0),
diff --git a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
index 090060e..3b96e70 100644
--- a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
+++ b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
@@ -115,9 +115,9 @@ struct MachineGadgetGraph : ImmutableGraph<MachineInstr *, int> {
static constexpr MachineInstr *const ArgNodeSentinel = nullptr;
using GraphT = ImmutableGraph<MachineInstr *, int>;
- using Node = typename GraphT::Node;
- using Edge = typename GraphT::Edge;
- using size_type = typename GraphT::size_type;
+ using Node = GraphT::Node;
+ using Edge = GraphT::Edge;
+ using size_type = GraphT::size_type;
MachineGadgetGraph(std::unique_ptr<Node[]> Nodes,
std::unique_ptr<Edge[]> Edges, size_type NodesSize,
size_type EdgesSize, int NumFences = 0, int NumGadgets = 0)
@@ -191,10 +191,10 @@ template <>
struct DOTGraphTraits<MachineGadgetGraph *> : DefaultDOTGraphTraits {
using GraphType = MachineGadgetGraph;
using Traits = llvm::GraphTraits<GraphType *>;
- using NodeRef = typename Traits::NodeRef;
- using EdgeRef = typename Traits::EdgeRef;
- using ChildIteratorType = typename Traits::ChildIteratorType;
- using ChildEdgeIteratorType = typename Traits::ChildEdgeIteratorType;
+ using NodeRef = Traits::NodeRef;
+ using EdgeRef = Traits::EdgeRef;
+ using ChildIteratorType = Traits::ChildIteratorType;
+ using ChildEdgeIteratorType = Traits::ChildEdgeIteratorType;
DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
@@ -227,9 +227,6 @@ struct DOTGraphTraits<MachineGadgetGraph *> : DefaultDOTGraphTraits {
} // end namespace llvm
-constexpr MachineInstr *MachineGadgetGraph::ArgNodeSentinel;
-constexpr int MachineGadgetGraph::GadgetEdgeSentinel;
-
char X86LoadValueInjectionLoadHardeningPass::ID = 0;
void X86LoadValueInjectionLoadHardeningPass::getAnalysisUsage(
@@ -335,7 +332,7 @@ X86LoadValueInjectionLoadHardeningPass::getGadgetGraph(
L.computePhiInfo();
GraphBuilder Builder;
- using GraphIter = typename GraphBuilder::BuilderNodeRef;
+ using GraphIter = GraphBuilder::BuilderNodeRef;
DenseMap<MachineInstr *, GraphIter> NodeMap;
int FenceCount = 0, GadgetCount = 0;
auto MaybeAddNode = [&NodeMap, &Builder](MachineInstr *MI) {
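
The typename keywords dropped above were never needed here: ImmutableGraph<MachineInstr *, int> is a concrete instantiation, so its nested names are not dependent. The deleted out-of-class definitions are likewise redundant, because static constexpr data members are implicitly inline since C++17. A minimal sketch of both points (illustrative, not from the patch):

struct Graph { using Node = int; };        // concrete type: no 'typename' required
using NodeAlias = Graph::Node;

struct Sentinels {
  static constexpr int ArgNode = -1;       // implicitly inline since C++17
};
// ODR-using the member needs no separate out-of-class definition.
const int *sentinelAddress() { return &Sentinels::ArgNode; }
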
diff --git a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
index 7f33939..662aec2 100644
--- a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
+++ b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
@@ -23,12 +23,15 @@
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Analysis.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsX86.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
@@ -40,7 +43,7 @@
using namespace llvm;
using namespace PatternMatch;
-#define DEBUG_TYPE "lower-amx-intrinsics"
+#define DEBUG_TYPE "x86-lower-amx-intrinsics"
#ifndef NDEBUG
static bool isV256I32Ty(Type *Ty) {
@@ -627,6 +630,37 @@ bool X86LowerAMXIntrinsics::visit() {
}
namespace {
+bool shouldRunLowerAMXIntrinsics(const Function &F, const TargetMachine *TM) {
+ return X86ScalarizeAMX && (F.hasFnAttribute(Attribute::OptimizeNone) ||
+ TM->getOptLevel() == CodeGenOptLevel::None);
+}
+
+bool runLowerAMXIntrinsics(Function &F, DominatorTree *DT, LoopInfo *LI) {
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
+
+ X86LowerAMXIntrinsics LAT(F, DTU, LI);
+ return LAT.visit();
+}
+} // namespace
+
+PreservedAnalyses X86LowerAMXIntrinsicsPass::run(Function &F,
+ FunctionAnalysisManager &FAM) {
+ if (!shouldRunLowerAMXIntrinsics(F, TM))
+ return PreservedAnalyses::all();
+
+ DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+ LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
+ bool Changed = runLowerAMXIntrinsics(F, &DT, &LI);
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA = PreservedAnalyses::none();
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<LoopAnalysis>();
+ return PA;
+}
+
+namespace {
class X86LowerAMXIntrinsicsLegacyPass : public FunctionPass {
public:
static char ID;
@@ -634,21 +668,15 @@ public:
X86LowerAMXIntrinsicsLegacyPass() : FunctionPass(ID) {}
bool runOnFunction(Function &F) override {
- if (!X86ScalarizeAMX)
- return false;
TargetMachine *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
- if (!F.hasFnAttribute(Attribute::OptimizeNone) &&
- TM->getOptLevel() != CodeGenOptLevel::None)
+ if (!shouldRunLowerAMXIntrinsics(F, TM))
return false;
auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
- DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
-
- X86LowerAMXIntrinsics LAT(F, DTU, LI);
- return LAT.visit();
+ return runLowerAMXIntrinsics(F, DT, LI);
}
StringRef getPassName() const override { return "Lower AMX intrinsics"; }
@@ -668,6 +696,6 @@ INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(X86LowerAMXIntrinsicsLegacyPass, DEBUG_TYPE, PassName,
false, false)
-FunctionPass *llvm::createX86LowerAMXIntrinsicsPass() {
+FunctionPass *llvm::createX86LowerAMXIntrinsicsLegacyPass() {
return new X86LowerAMXIntrinsicsLegacyPass();
}
diff --git a/llvm/lib/Target/X86/X86OptimizeLEAs.cpp b/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
index 167bed1..c964605 100644
--- a/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
+++ b/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
@@ -359,7 +359,7 @@ bool X86OptimizeLEAPass::chooseBestLEA(
// example MOV8mr_NOREX. We could constrain the register class of the LEA
// def to suit MI, however since this case is very rare and hard to
// reproduce in a test it's just more reliable to skip the LEA.
- if (TII->getRegClass(Desc, MemOpNo + X86::AddrBaseReg, TRI) !=
+ if (TII->getRegClass(Desc, MemOpNo + X86::AddrBaseReg) !=
MRI->getRegClass(DefMI->getOperand(0).getReg()))
continue;
diff --git a/llvm/lib/Target/X86/X86PartialReduction.cpp b/llvm/lib/Target/X86/X86PartialReduction.cpp
index a25e4e0..898c83c 100644
--- a/llvm/lib/Target/X86/X86PartialReduction.cpp
+++ b/llvm/lib/Target/X86/X86PartialReduction.cpp
@@ -16,10 +16,12 @@
#include "X86TargetMachine.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Analysis.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsX86.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"
#include "llvm/Support/KnownBits.h"
@@ -30,39 +32,44 @@ using namespace llvm;
namespace {
-class X86PartialReduction : public FunctionPass {
+class X86PartialReduction {
+ const X86TargetMachine *TM;
const DataLayout *DL = nullptr;
const X86Subtarget *ST = nullptr;
public:
+ X86PartialReduction(const X86TargetMachine *TM) : TM(TM) {}
+ bool run(Function &F);
+
+private:
+ bool tryMAddReplacement(Instruction *Op, bool ReduceInOneBB);
+ bool trySADReplacement(Instruction *Op);
+};
+
+class X86PartialReductionLegacy : public FunctionPass {
+public:
static char ID; // Pass identification, replacement for typeid.
- X86PartialReduction() : FunctionPass(ID) { }
+ X86PartialReductionLegacy() : FunctionPass(ID) {}
- bool runOnFunction(Function &Fn) override;
+ bool runOnFunction(Function &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
}
- StringRef getPassName() const override {
- return "X86 Partial Reduction";
- }
-
-private:
- bool tryMAddReplacement(Instruction *Op, bool ReduceInOneBB);
- bool trySADReplacement(Instruction *Op);
+ StringRef getPassName() const override { return "X86 Partial Reduction"; }
};
}
-FunctionPass *llvm::createX86PartialReductionPass() {
- return new X86PartialReduction();
+FunctionPass *llvm::createX86PartialReductionLegacyPass() {
+ return new X86PartialReductionLegacy();
}
-char X86PartialReduction::ID = 0;
+char X86PartialReductionLegacy::ID = 0;
-INITIALIZE_PASS(X86PartialReduction, DEBUG_TYPE,
- "X86 Partial Reduction", false, false)
+INITIALIZE_PASS(X86PartialReductionLegacy, DEBUG_TYPE, "X86 Partial Reduction",
+ false, false)
// This function should be aligned with detectExtMul() in X86ISelLowering.cpp.
static bool matchVPDPBUSDPattern(const X86Subtarget *ST, BinaryOperator *Mul,
@@ -494,17 +501,8 @@ static void collectLeaves(Value *Root, SmallVectorImpl<Instruction *> &Leaves) {
}
}
-bool X86PartialReduction::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
-
- auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
- if (!TPC)
- return false;
-
- auto &TM = TPC->getTM<X86TargetMachine>();
- ST = TM.getSubtargetImpl(F);
-
+bool X86PartialReduction::run(Function &F) {
+ ST = TM->getSubtargetImpl(F);
DL = &F.getDataLayout();
bool MadeChange = false;
@@ -540,3 +538,25 @@ bool X86PartialReduction::runOnFunction(Function &F) {
return MadeChange;
}
+
+bool X86PartialReductionLegacy::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC)
+ return false;
+
+ return X86PartialReduction(&TPC->getTM<X86TargetMachine>()).run(F);
+}
+
+PreservedAnalyses X86PartialReductionPass::run(Function &F,
+ FunctionAnalysisManager &FAM) {
+ bool Changed = X86PartialReduction(TM).run(F);
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA = PreservedAnalyses::none();
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
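
The new-PM run method mirrors the legacy pass's AU.setPreservesCFG(): if nothing changed, everything is preserved; otherwise only CFG-level analyses survive. A small sketch of that contract, assuming the usual LLVM headers (illustrative only):

#include "llvm/IR/Analysis.h"
#include "llvm/IR/PassManager.h"

// Report the standard "changed but CFG intact" preservation set.
llvm::PreservedAnalyses reportChanges(bool Changed) {
  if (!Changed)
    return llvm::PreservedAnalyses::all();
  llvm::PreservedAnalyses PA = llvm::PreservedAnalyses::none();
  PA.preserveSet<llvm::CFGAnalyses>();   // analogue of AU.setPreservesCFG()
  return PA;
}
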
diff --git a/llvm/lib/Target/X86/X86PassRegistry.def b/llvm/lib/Target/X86/X86PassRegistry.def
index fc25d55..b80ad38 100644
--- a/llvm/lib/Target/X86/X86PassRegistry.def
+++ b/llvm/lib/Target/X86/X86PassRegistry.def
@@ -15,20 +15,23 @@
#ifndef FUNCTION_PASS
#define FUNCTION_PASS(NAME, CREATE_PASS)
#endif
+FUNCTION_PASS("x86-lower-amx-intrinsics", X86LowerAMXIntrinsicsPass(this))
FUNCTION_PASS("x86-lower-amx-type", X86LowerAMXTypePass(this))
+FUNCTION_PASS("x86-partial-reduction", X86PartialReductionPass(this))
#undef FUNCTION_PASS
#ifndef DUMMY_FUNCTION_PASS
#define DUMMY_FUNCTION_PASS(NAME, CREATE_PASS)
#endif
-DUMMY_FUNCTION_PASS("lower-amx-intrinsics", X86LowerAMXIntrinsics(*this))
-DUMMY_FUNCTION_PASS("x86-partial-reduction", X86PartialReduction())
DUMMY_FUNCTION_PASS("x86-winehstate", WinEHStatePass())
#undef DUMMY_FUNCTION_PASS
#ifndef MACHINE_FUNCTION_PASS
#define MACHINE_FUNCTION_PASS(NAME, CREATE_PASS)
#endif
+MACHINE_FUNCTION_PASS("x86-avoid-trailing-call", X86AvoidTrailingCallPass())
+MACHINE_FUNCTION_PASS("x86-dyn-alloca-expander", X86DynAllocaExpanderPass())
+MACHINE_FUNCTION_PASS("x86-fp-stackifier", X86FPStackifierPass())
MACHINE_FUNCTION_PASS("x86-isel", X86ISelDAGToDAGPass(*this))
#undef MACHINE_FUNCTION_PASS
@@ -36,13 +39,10 @@ MACHINE_FUNCTION_PASS("x86-isel", X86ISelDAGToDAGPass(*this))
#define DUMMY_MACHINE_FUNCTION_PASS(NAME, PASS_NAME)
#endif
DUMMY_MACHINE_FUNCTION_PASS("x86-avoid-SFB", X86AvoidSFBPass())
-DUMMY_MACHINE_FUNCTION_PASS("x86-avoid-trailing-call", X86AvoidTrailingCallPass())
DUMMY_MACHINE_FUNCTION_PASS("x86-cf-opt", X86CallFrameOptimization())
DUMMY_MACHINE_FUNCTION_PASS("x86-cmov-conversion", X86CmovConverterPass())
-DUMMY_MACHINE_FUNCTION_PASS("x86-codege", FPS())
DUMMY_MACHINE_FUNCTION_PASS("x86-compress-evex", CompressEVEXPass())
DUMMY_MACHINE_FUNCTION_PASS("x86-domain-reassignment", X86DomainReassignment())
-DUMMY_MACHINE_FUNCTION_PASS("x86-dyn-alloca-expander", X86DynAllocaExpander())
DUMMY_MACHINE_FUNCTION_PASS("x86-execution-domain-fix", X86ExecutionDomainFix())
DUMMY_MACHINE_FUNCTION_PASS("fastpretileconfig", X86FastPreTileConfig())
DUMMY_MACHINE_FUNCTION_PASS("fasttileconfig", X86FastTileConfig())
diff --git a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
index e0b3b61..829a32e 100644
--- a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
+++ b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
@@ -54,7 +54,6 @@
#include <cassert>
#include <iterator>
#include <optional>
-#include <utility>
using namespace llvm;
@@ -841,7 +840,7 @@ getRegClassForUnfoldedLoad(const X86InstrInfo &TII, unsigned Opcode) {
unsigned UnfoldedOpc = TII.getOpcodeAfterMemoryUnfold(
Opcode, /*UnfoldLoad*/ true, /*UnfoldStore*/ false, &Index);
const MCInstrDesc &MCID = TII.get(UnfoldedOpc);
- return TII.getRegClass(MCID, Index, &TII.getRegisterInfo());
+ return TII.getRegClass(MCID, Index);
}
void X86SpeculativeLoadHardeningPass::unfoldCallAndJumpLoads(
diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp
index 66d9e74..61f288f 100644
--- a/llvm/lib/Target/X86/X86Subtarget.cpp
+++ b/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -32,7 +32,6 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/TargetParser/Triple.h"
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 9a76abc..713df63 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -50,7 +50,6 @@
#include "llvm/Transforms/CFGuard.h"
#include <memory>
#include <optional>
-#include <string>
using namespace llvm;
@@ -77,7 +76,7 @@ extern "C" LLVM_C_ABI void LLVMInitializeX86Target() {
initializeFixupBWInstPassPass(PR);
initializeCompressEVEXPassPass(PR);
initializeFixupLEAPassPass(PR);
- initializeFPSPass(PR);
+ initializeX86FPStackifierLegacyPass(PR);
initializeX86FixupSetCCPassPass(PR);
initializeX86CallFrameOptimizationPass(PR);
initializeX86CmovConverterPassPass(PR);
@@ -90,14 +89,14 @@ extern "C" LLVM_C_ABI void LLVMInitializeX86Target() {
initializeX86ExecutionDomainFixPass(PR);
initializeX86DomainReassignmentPass(PR);
initializeX86AvoidSFBPassPass(PR);
- initializeX86AvoidTrailingCallPassPass(PR);
+ initializeX86AvoidTrailingCallLegacyPassPass(PR);
initializeX86SpeculativeLoadHardeningPassPass(PR);
initializeX86SpeculativeExecutionSideEffectSuppressionPass(PR);
initializeX86FlagsCopyLoweringPassPass(PR);
initializeX86LoadValueInjectionLoadHardeningPassPass(PR);
initializeX86LoadValueInjectionRetHardeningPassPass(PR);
initializeX86OptimizeLEAPassPass(PR);
- initializeX86PartialReductionPass(PR);
+ initializeX86PartialReductionLegacyPass(PR);
initializePseudoProbeInserterPass(PR);
initializeX86ReturnThunksPass(PR);
initializeX86DAGToDAGISelLegacyPass(PR);
@@ -105,7 +104,7 @@ extern "C" LLVM_C_ABI void LLVMInitializeX86Target() {
initializeX86AsmPrinterPass(PR);
initializeX86FixupInstTuningPassPass(PR);
initializeX86FixupVectorConstantsPassPass(PR);
- initializeX86DynAllocaExpanderPass(PR);
+ initializeX86DynAllocaExpanderLegacyPass(PR);
initializeX86SuppressAPXForRelocationPassPass(PR);
initializeX86WinEHUnwindV2Pass(PR);
}
@@ -422,14 +421,14 @@ void X86PassConfig::addIRPasses() {
  // We add both passes anyway and, when these two passes run, we skip the pass
// based on the option level and option attribute.
- addPass(createX86LowerAMXIntrinsicsPass());
+ addPass(createX86LowerAMXIntrinsicsLegacyPass());
addPass(createX86LowerAMXTypeLegacyPass());
TargetPassConfig::addIRPasses();
if (TM->getOptLevel() != CodeGenOptLevel::None) {
addPass(createInterleavedAccessPass());
- addPass(createX86PartialReductionPass());
+ addPass(createX86PartialReductionLegacyPass());
}
// Add passes that handle indirect branch removal and insertion of a retpoline
@@ -517,7 +516,7 @@ void X86PassConfig::addPreRegAlloc() {
addPass(createX86SpeculativeLoadHardeningPass());
addPass(createX86FlagsCopyLoweringPass());
- addPass(createX86DynAllocaExpander());
+ addPass(createX86DynAllocaExpanderLegacyPass());
if (getOptLevel() != CodeGenOptLevel::None)
addPass(createX86PreTileConfigPass());
@@ -532,7 +531,7 @@ void X86PassConfig::addMachineSSAOptimization() {
void X86PassConfig::addPostRegAlloc() {
addPass(createX86LowerTileCopyPass());
- addPass(createX86FloatingPointStackifierPass());
+ addPass(createX86FPStackifierLegacyPass());
// When -O0 is enabled, the Load Value Injection Hardening pass will fall back
// to using the Speculative Execution Side Effect Suppression pass for
// mitigation. This is to prevent slow downs due to
@@ -564,8 +563,6 @@ void X86PassConfig::addPreEmitPass() {
addPass(createX86FixupVectorConstants());
}
addPass(createX86CompressEVEXPass());
- addPass(createX86DiscriminateMemOpsPass());
- addPass(createX86InsertPrefetchPass());
addPass(createX86InsertX87waitPass());
}
@@ -589,7 +586,7 @@ void X86PassConfig::addPreEmitPass2() {
// Insert extra int3 instructions after trailing call instructions to avoid
// issues in the unwinder.
if (TT.isOSWindows() && TT.isX86_64())
- addPass(createX86AvoidTrailingCallPass());
+ addPass(createX86AvoidTrailingCallLegacyPass());
// Verify basic block incoming and outgoing cfa offset and register values and
// correct CFA calculation rule where needed by inserting appropriate CFI
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 3d8d0a23..9fb9791 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -5411,9 +5411,28 @@ InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
}
InstructionCost
-X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
- unsigned AddressSpace,
+X86TTIImpl::getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA,
+ TTI::TargetCostKind CostKind) const {
+ switch (MICA.getID()) {
+ case Intrinsic::masked_scatter:
+ case Intrinsic::masked_gather:
+ return getGatherScatterOpCost(MICA, CostKind);
+ case Intrinsic::masked_load:
+ case Intrinsic::masked_store:
+ return getMaskedMemoryOpCost(MICA, CostKind);
+ }
+ return BaseT::getMemIntrinsicInstrCost(MICA, CostKind);
+}
+
+InstructionCost
+X86TTIImpl::getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA,
TTI::TargetCostKind CostKind) const {
+ unsigned Opcode = MICA.getID() == Intrinsic::masked_load ? Instruction::Load
+ : Instruction::Store;
+ Type *SrcTy = MICA.getDataType();
+ Align Alignment = MICA.getAlignment();
+ unsigned AddressSpace = MICA.getAddressSpace();
+
bool IsLoad = (Instruction::Load == Opcode);
bool IsStore = (Instruction::Store == Opcode);
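
Callers now describe the memory intrinsic with a MemIntrinsicCostAttributes bundle instead of loose parameters; the brace-init used later in getInterleavedMemoryOpCostAVX512 shows the expected field order. A hedged sketch of a query through the new entry point (the helper name is hypothetical):

// Hypothetical helper: cost of a masked store through the new interface,
// using the {ID, data type, alignment, address space} brace-init seen below.
llvm::InstructionCost maskedStoreCost(llvm::X86TTIImpl &TTI, llvm::Type *Ty,
                                      llvm::Align A, unsigned AS) {
  return TTI.getMemIntrinsicInstrCost(
      {llvm::Intrinsic::masked_store, Ty, A, AS},
      llvm::TargetTransformInfo::TCK_RecipThroughput);
}
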
@@ -6253,10 +6272,15 @@ InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode,
}
/// Calculate the cost of Gather / Scatter operation
-InstructionCost X86TTIImpl::getGatherScatterOpCost(
- unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask,
- Align Alignment, TTI::TargetCostKind CostKind,
- const Instruction *I = nullptr) const {
+InstructionCost
+X86TTIImpl::getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA,
+ TTI::TargetCostKind CostKind) const {
+ bool IsLoad = MICA.getID() == Intrinsic::masked_gather ||
+ MICA.getID() == Intrinsic::vp_gather;
+ unsigned Opcode = IsLoad ? Instruction::Load : Instruction::Store;
+ Type *SrcVTy = MICA.getDataType();
+ const Value *Ptr = MICA.getPointer();
+ Align Alignment = MICA.getAlignment();
if ((Opcode == Instruction::Load &&
(!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
@@ -6265,8 +6289,7 @@ InstructionCost X86TTIImpl::getGatherScatterOpCost(
(!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
Align(Alignment)))))
- return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
- Alignment, CostKind, I);
+ return BaseT::getMemIntrinsicInstrCost(MICA, CostKind);
assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
@@ -6317,7 +6340,8 @@ static bool isLegalMaskedLoadStore(Type *ScalarTy, const X86Subtarget *ST) {
}
bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment,
- unsigned AddressSpace) const {
+ unsigned AddressSpace,
+ TTI::MaskKind MaskKind) const {
Type *ScalarTy = DataTy->getScalarType();
// The backend can't handle a single element vector w/o CFCMOV.
@@ -6330,7 +6354,8 @@ bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment,
}
bool X86TTIImpl::isLegalMaskedStore(Type *DataTy, Align Alignment,
- unsigned AddressSpace) const {
+ unsigned AddressSpace,
+ TTI::MaskKind MaskKind) const {
Type *ScalarTy = DataTy->getScalarType();
// The backend can't handle a single element vector w/o CFCMOV.
@@ -6562,7 +6587,7 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller,
bool X86TTIImpl::areTypesABICompatible(const Function *Caller,
const Function *Callee,
- const ArrayRef<Type *> &Types) const {
+ ArrayRef<Type *> Types) const {
if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
return false;
@@ -6647,10 +6672,12 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
LegalVT.getVectorNumElements());
InstructionCost MemOpCost;
bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps;
- if (UseMaskedMemOp)
- MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment,
- AddressSpace, CostKind);
- else
+ if (UseMaskedMemOp) {
+ unsigned IID = Opcode == Instruction::Load ? Intrinsic::masked_load
+ : Intrinsic::masked_store;
+ MemOpCost = getMaskedMemoryOpCost(
+ {IID, SingleMemOpTy, Alignment, AddressSpace}, CostKind);
+ } else
MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace,
CostKind);
@@ -7223,3 +7250,19 @@ bool X86TTIImpl::isProfitableToSinkOperands(Instruction *I,
return false;
}
+
+bool X86TTIImpl::useFastCCForInternalCall(Function &F) const {
+ bool HasEGPR = ST->hasEGPR();
+ const TargetMachine &TM = getTLI()->getTargetMachine();
+
+ for (User *U : F.users()) {
+ CallBase *CB = dyn_cast<CallBase>(U);
+ if (!CB || CB->getCalledOperand() != &F)
+ continue;
+ Function *CallerFunc = CB->getFunction();
+ if (TM.getSubtarget<X86Subtarget>(*CallerFunc).hasEGPR() != HasEGPR)
+ return false;
+ }
+
+ return true;
+}
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 133b366..4f67279 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -183,14 +183,12 @@ public:
TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
const Instruction *I = nullptr) const override;
InstructionCost
- getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
- unsigned AddressSpace,
- TTI::TargetCostKind CostKind) const override;
- InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
- const Value *Ptr, bool VariableMask,
- Align Alignment,
- TTI::TargetCostKind CostKind,
- const Instruction *I) const override;
+ getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA,
+ TTI::TargetCostKind CostKind) const override;
+ InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA,
+ TTI::TargetCostKind CostKind) const;
+ InstructionCost getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA,
+ TTI::TargetCostKind CostKind) const;
InstructionCost
getPointersChainCost(ArrayRef<const Value *> Ptrs, const Value *Base,
const TTI::PointersChainInfo &Info, Type *AccessTy,
@@ -268,10 +266,14 @@ public:
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
const TargetTransformInfo::LSRCost &C2) const override;
bool canMacroFuseCmp() const override;
- bool isLegalMaskedLoad(Type *DataType, Align Alignment,
- unsigned AddressSpace) const override;
- bool isLegalMaskedStore(Type *DataType, Align Alignment,
- unsigned AddressSpace) const override;
+ bool
+ isLegalMaskedLoad(Type *DataType, Align Alignment, unsigned AddressSpace,
+ TTI::MaskKind MaskKind =
+ TTI::MaskKind::VariableOrConstantMask) const override;
+ bool
+ isLegalMaskedStore(Type *DataType, Align Alignment, unsigned AddressSpace,
+ TTI::MaskKind MaskKind =
+ TTI::MaskKind::VariableOrConstantMask) const override;
bool isLegalNTLoad(Type *DataType, Align Alignment) const override;
bool isLegalNTStore(Type *DataType, Align Alignment) const override;
bool isLegalBroadcastLoad(Type *ElementTy,
@@ -296,7 +298,7 @@ public:
bool areInlineCompatible(const Function *Caller,
const Function *Callee) const override;
bool areTypesABICompatible(const Function *Caller, const Function *Callee,
- const ArrayRef<Type *> &Type) const override;
+ ArrayRef<Type *> Type) const override;
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override {
return ST->getMaxInlineSizeThreshold();
@@ -319,6 +321,8 @@ public:
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
Type *ScalarValTy) const override;
+ bool useFastCCForInternalCall(Function &F) const override;
+
private:
bool supportsGather() const;
InstructionCost getGSVectorCost(unsigned Opcode, TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Target/X86/X86VZeroUpper.cpp b/llvm/lib/Target/X86/X86VZeroUpper.cpp
index f6f7e92..2f28ab3 100644
--- a/llvm/lib/Target/X86/X86VZeroUpper.cpp
+++ b/llvm/lib/Target/X86/X86VZeroUpper.cpp
@@ -66,7 +66,7 @@ namespace {
MachineBasicBlock &MBB);
void addDirtySuccessor(MachineBasicBlock &MBB);
- using BlockExitState = enum { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY };
+ enum BlockExitState { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY };
static const char* getBlockExitStateName(BlockExitState ST);