Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrAtomics.td | 12
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrGISel.td | 2
-rw-r--r--  llvm/lib/Target/AArch64/AArch64RegisterInfo.td | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPU.td | 18
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSubtarget.h | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 69
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.h | 46
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP3Instructions.td | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 13
-rw-r--r--  llvm/lib/Target/ARM/ARMAsmPrinter.cpp | 2
-rw-r--r--  llvm/lib/Target/ARM/ARMISelLowering.cpp | 5
-rw-r--r--  llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp | 15
-rw-r--r--  llvm/lib/Target/ARM/ARMSubtarget.h | 1
-rw-r--r--  llvm/lib/Target/ARM/Thumb2InstrInfo.cpp | 10
-rw-r--r--  llvm/lib/Target/ARM/Thumb2InstrInfo.h | 3
-rw-r--r--  llvm/lib/Target/BPF/BTFDebug.cpp | 19
-rw-r--r--  llvm/lib/Target/DirectX/DXILPrepare.cpp | 8
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp | 40
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp | 3
-rw-r--r--  llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 75
-rw-r--r--  llvm/lib/Target/PowerPC/PPCInstrVSX.td | 30
-rw-r--r--  llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp | 3
-rw-r--r--  llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp | 5
-rw-r--r--  llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerCombiner.cpp | 51
-rw-r--r--  llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp | 3
-rw-r--r--  llvm/lib/Target/RISCV/RISCVCombine.td | 11
-rw-r--r--  llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp | 229
-rw-r--r--  llvm/lib/Target/RISCV/RISCVFeatures.td | 41
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 25
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelLowering.h | 1
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 70
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoA.td | 80
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td | 8
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td | 34
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVCBufferAccess.cpp | 14
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp | 50
-rw-r--r--  llvm/lib/Target/TargetMachine.cpp | 1
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp | 5
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td | 53
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp | 9
-rw-r--r--  llvm/lib/Target/X86/X86InstrAVX512.td | 2
-rw-r--r--  llvm/lib/Target/X86/X86ScheduleZnver4.td | 110
44 files changed, 937 insertions(+), 265 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
index 31fcd63..5d9215d 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
@@ -136,8 +136,8 @@ def : Pat<(f32 (bitconvert (i32 (relaxed_load<atomic_load_nonext_32>
(ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend))))),
(LDRSroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend)>;
def : Pat<(f32 (bitconvert (i32 (relaxed_load<atomic_load_nonext_32>
- (am_indexed32 GPR64sp:$Rn, uimm12s8:$offset))))),
- (LDRSui GPR64sp:$Rn, uimm12s8:$offset)>;
+ (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
+ (LDRSui GPR64sp:$Rn, uimm12s4:$offset)>;
def : Pat<(f32 (bitconvert (i32 (relaxed_load<atomic_load_nonext_32>
(am_unscaled32 GPR64sp:$Rn, simm9:$offset))))),
(LDURSi GPR64sp:$Rn, simm9:$offset)>;
@@ -236,11 +236,11 @@ def : Pat<(relaxed_store<atomic_store_32>
def : Pat<(releasing_store<atomic_store_64> GPR64sp:$ptr, GPR64:$val),
(STLRX GPR64:$val, GPR64sp:$ptr)>;
def : Pat<(relaxed_store<atomic_store_64> (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
- ro_Wextend16:$extend),
+ ro_Wextend64:$extend),
GPR64:$val),
(STRXroW GPR64:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>;
def : Pat<(relaxed_store<atomic_store_64> (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
- ro_Xextend16:$extend),
+ ro_Xextend64:$extend),
GPR64:$val),
(STRXroX GPR64:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>;
def : Pat<(relaxed_store<atomic_store_64>
@@ -276,8 +276,8 @@ def : Pat<(relaxed_store<atomic_store_64> (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
(i64 (bitconvert (f64 FPR64Op:$val)))),
(STRDroX FPR64Op:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>;
def : Pat<(relaxed_store<atomic_store_64>
- (am_indexed64 GPR64sp:$Rn, uimm12s4:$offset), (i64 (bitconvert (f64 FPR64Op:$val)))),
- (STRDui FPR64Op:$val, GPR64sp:$Rn, uimm12s4:$offset)>;
+ (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset), (i64 (bitconvert (f64 FPR64Op:$val)))),
+ (STRDui FPR64Op:$val, GPR64sp:$Rn, uimm12s8:$offset)>;
def : Pat<(relaxed_store<atomic_store_64>
(am_unscaled64 GPR64sp:$Rn, simm9:$offset), (i64 (bitconvert (f64 FPR64Op:$val)))),
(STURDi FPR64Op:$val, GPR64sp:$Rn, simm9:$offset)>;
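
The corrected uimm12 scale factors above matter when a relaxed atomic integer access is bitcast to or from an FP value. A minimal IR sketch of the kind of input these patterns select (illustrative, not taken from the patch; the function name is hypothetical):

; Relaxed (monotonic) atomic i32 load reinterpreted as f32; the unsigned
; 12-bit offset form must be scaled by 4 for a 32-bit access (uimm12s4).
define float @relaxed_load_f32(ptr %p) {
  %i = load atomic i32, ptr %p monotonic, align 4
  %f = bitcast i32 %i to float
  ret float %f
}
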
diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
index fe84193..30b7b03 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -507,7 +507,7 @@ let AddedComplexity = 19 in {
defm : VecROStoreLane64_0Pat<ro32, store, v2i32, i32, ssub, STRSroW, STRSroX>;
}
-def : Pat<(v8i8 (AArch64dup (i8 (load (am_indexed8 GPR64sp:$Rn))))),
+def : Pat<(v8i8 (AArch64dup (i8 (load GPR64sp:$Rn)))),
(LD1Rv8b GPR64sp:$Rn)>;
def : Pat<(v16i8 (AArch64dup (i8 (load GPR64sp:$Rn)))),
(LD1Rv16b GPR64sp:$Rn)>;
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
index ef974df..47144c7 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -993,7 +993,7 @@ def PPR_3b : PPRClass<0, 7> { // Restricted 3 bit SVE predicate register class.
let DecoderMethod = "DecodeSimpleRegisterClass<AArch64::PPRRegClassID, 0, 8>";
}
def PPR_p8to15 : PPRClass<8, 15> {
- let DecoderMethod = "DecodeSimpleRegisterClass<AArch64::PNRRegClassID, 8, 8>";
+ let DecoderMethod = "DecodeSimpleRegisterClass<AArch64::PPRRegClassID, 8, 8>";
}
def PPRMul2 : PPRClass<0, 14, 2>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index ea32748..1c8383c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1430,6 +1430,18 @@ def FeatureAddSubU64Insts
def FeatureMadU32Inst : SubtargetFeature<"mad-u32-inst", "HasMadU32Inst",
"true", "Has v_mad_u32 instruction">;
+def FeatureAddMinMaxInsts : SubtargetFeature<"add-min-max-insts",
+ "HasAddMinMaxInsts",
+ "true",
+ "Has v_add_{min|max}_{i|u}32 instructions"
+>;
+
+def FeaturePkAddMinMaxInsts : SubtargetFeature<"pk-add-min-max-insts",
+ "HasPkAddMinMaxInsts",
+ "true",
+ "Has v_pk_add_{min|max}_{i|u}16 instructions"
+>;
+
def FeatureMemToLDSLoad : SubtargetFeature<"vmem-to-lds-load-insts",
"HasVMemToLDSLoad",
"true",
@@ -2115,6 +2127,8 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureLshlAddU64Inst,
FeatureAddSubU64Insts,
FeatureMadU32Inst,
+ FeatureAddMinMaxInsts,
+ FeaturePkAddMinMaxInsts,
FeatureLdsBarrierArriveAtomic,
FeatureSetPrioIncWgInst,
Feature45BitNumRecordsBufferResource,
@@ -2658,11 +2672,11 @@ def HasFmaakFmamkF64Insts :
def HasAddMinMaxInsts :
Predicate<"Subtarget->hasAddMinMaxInsts()">,
- AssemblerPredicate<(any_of FeatureGFX1250Insts)>;
+ AssemblerPredicate<(any_of FeatureAddMinMaxInsts)>;
def HasPkAddMinMaxInsts :
Predicate<"Subtarget->hasPkAddMinMaxInsts()">,
- AssemblerPredicate<(any_of FeatureGFX1250Insts)>;
+ AssemblerPredicate<(any_of FeaturePkAddMinMaxInsts)>;
def HasPkMinMax3Insts :
Predicate<"Subtarget->hasPkMinMax3Insts()">,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 56807a4..54ba2f8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4835,6 +4835,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_perm_pk16_b4_u4:
case Intrinsic::amdgcn_perm_pk16_b6_u4:
case Intrinsic::amdgcn_perm_pk16_b8_u4:
+ case Intrinsic::amdgcn_add_max_i32:
+ case Intrinsic::amdgcn_add_max_u32:
+ case Intrinsic::amdgcn_add_min_i32:
+ case Intrinsic::amdgcn_add_min_u32:
+ case Intrinsic::amdgcn_pk_add_max_i16:
+ case Intrinsic::amdgcn_pk_add_max_u16:
+ case Intrinsic::amdgcn_pk_add_min_i16:
+ case Intrinsic::amdgcn_pk_add_min_u16:
return getDefaultMappingVOP(MI);
case Intrinsic::amdgcn_log:
case Intrinsic::amdgcn_exp2:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 996b55f..02c5390 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -2086,7 +2086,7 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const {
(AMDGPUAtomicOptimizerStrategy != ScanOptions::None))
addPass(AMDGPUAtomicOptimizerPass(TM, AMDGPUAtomicOptimizerStrategy));
- addPass(AtomicExpandPass(&TM));
+ addPass(AtomicExpandPass(TM));
if (TM.getOptLevel() > CodeGenOptLevel::None) {
addPass(AMDGPUPromoteAllocaPass(TM));
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index a466780..ac660d5 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -277,6 +277,8 @@ protected:
bool HasLshlAddU64Inst = false;
bool HasAddSubU64Insts = false;
bool HasMadU32Inst = false;
+ bool HasAddMinMaxInsts = false;
+ bool HasPkAddMinMaxInsts = false;
bool HasPointSampleAccel = false;
bool HasLdsBarrierArriveAtomic = false;
bool HasSetPrioIncWgInst = false;
@@ -1567,10 +1569,10 @@ public:
bool hasIntMinMax64() const { return GFX1250Insts; }
// \returns true if the target has V_ADD_{MIN|MAX}_{I|U}32 instructions.
- bool hasAddMinMaxInsts() const { return GFX1250Insts; }
+ bool hasAddMinMaxInsts() const { return HasAddMinMaxInsts; }
// \returns true if the target has V_PK_ADD_{MIN|MAX}_{I|U}16 instructions.
- bool hasPkAddMinMaxInsts() const { return GFX1250Insts; }
+ bool hasPkAddMinMaxInsts() const { return HasPkAddMinMaxInsts; }
// \returns true if the target has V_PK_{MIN|MAX}3_{I|U}16 instructions.
bool hasPkMinMax3Insts() const { return GFX1250Insts; }
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 2ff2d2f..d930a21 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10628,6 +10628,59 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
return false;
+ const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
+ this]() -> bool {
+ if (CmpValue != 0)
+ return false;
+
+ MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
+ if (!Def || Def->getParent() != CmpInstr.getParent())
+ return false;
+
+ const auto foldableSelect = [](MachineInstr *Def) -> bool {
+ if (Def->getOpcode() == AMDGPU::S_CSELECT_B32 ||
+ Def->getOpcode() == AMDGPU::S_CSELECT_B64) {
+ bool Op1IsNonZeroImm =
+ Def->getOperand(1).isImm() && Def->getOperand(1).getImm() != 0;
+ bool Op2IsZeroImm =
+ Def->getOperand(2).isImm() && Def->getOperand(2).getImm() == 0;
+ if (Op1IsNonZeroImm && Op2IsZeroImm)
+ return true;
+ }
+ return false;
+ };
+
+ // For S_OP that set SCC = DST!=0, do the transformation
+ //
+ // s_cmp_lg_* (S_OP ...), 0 => (S_OP ...)
+
+ // If foldableSelect, s_cmp_lg_* is redundant because the SCC input value
+ // for S_CSELECT* already has the same value that will be calculated by
+ // s_cmp_lg_*
+ //
+ // s_cmp_lg_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT* (non-zero
+ // imm), 0)
+ if (!setsSCCifResultIsNonZero(*Def) && !foldableSelect(Def))
+ return false;
+
+ MachineInstr *KillsSCC = nullptr;
+ for (MachineInstr &MI :
+ make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) {
+ if (MI.modifiesRegister(AMDGPU::SCC, &RI))
+ return false;
+ if (MI.killsRegister(AMDGPU::SCC, &RI))
+ KillsSCC = &MI;
+ }
+
+ if (MachineOperand *SccDef =
+ Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
+ SccDef->setIsDead(false);
+ if (KillsSCC)
+ KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
+ CmpInstr.eraseFromParent();
+ return true;
+ };
+
const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
this](int64_t ExpectedValue, unsigned SrcSize,
bool IsReversible, bool IsSigned) -> bool {
@@ -10702,16 +10755,20 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
return false;
- for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
- I != E; ++I) {
- if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
- I->killsRegister(AMDGPU::SCC, &RI))
+ MachineInstr *KillsSCC = nullptr;
+ for (MachineInstr &MI :
+ make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) {
+ if (MI.modifiesRegister(AMDGPU::SCC, &RI))
return false;
+ if (MI.killsRegister(AMDGPU::SCC, &RI))
+ KillsSCC = &MI;
}
MachineOperand *SccDef =
Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
SccDef->setIsDead(false);
+ if (KillsSCC)
+ KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
CmpInstr.eraseFromParent();
if (!MRI->use_nodbg_empty(DefReg)) {
@@ -10755,7 +10812,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
case AMDGPU::S_CMP_LG_I32:
case AMDGPU::S_CMPK_LG_U32:
case AMDGPU::S_CMPK_LG_I32:
- return optimizeCmpAnd(0, 32, true, false);
+ return optimizeCmpAnd(0, 32, true, false) || optimizeCmpSelect();
case AMDGPU::S_CMP_GT_U32:
case AMDGPU::S_CMPK_GT_U32:
return optimizeCmpAnd(0, 32, false, false);
@@ -10763,7 +10820,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
case AMDGPU::S_CMPK_GT_I32:
return optimizeCmpAnd(0, 32, false, true);
case AMDGPU::S_CMP_LG_U64:
- return optimizeCmpAnd(0, 64, true, false);
+ return optimizeCmpAnd(0, 64, true, false) || optimizeCmpSelect();
}
return false;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index e1d7a07..5fdedda 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -714,6 +714,52 @@ public:
}
}
+ static bool setsSCCifResultIsNonZero(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case AMDGPU::S_ABSDIFF_I32:
+ case AMDGPU::S_ABS_I32:
+ case AMDGPU::S_AND_B32:
+ case AMDGPU::S_AND_B64:
+ case AMDGPU::S_ANDN2_B32:
+ case AMDGPU::S_ANDN2_B64:
+ case AMDGPU::S_ASHR_I32:
+ case AMDGPU::S_ASHR_I64:
+ case AMDGPU::S_BCNT0_I32_B32:
+ case AMDGPU::S_BCNT0_I32_B64:
+ case AMDGPU::S_BCNT1_I32_B32:
+ case AMDGPU::S_BCNT1_I32_B64:
+ case AMDGPU::S_BFE_I32:
+ case AMDGPU::S_BFE_I64:
+ case AMDGPU::S_BFE_U32:
+ case AMDGPU::S_BFE_U64:
+ case AMDGPU::S_LSHL_B32:
+ case AMDGPU::S_LSHL_B64:
+ case AMDGPU::S_LSHR_B32:
+ case AMDGPU::S_LSHR_B64:
+ case AMDGPU::S_NAND_B32:
+ case AMDGPU::S_NAND_B64:
+ case AMDGPU::S_NOR_B32:
+ case AMDGPU::S_NOR_B64:
+ case AMDGPU::S_NOT_B32:
+ case AMDGPU::S_NOT_B64:
+ case AMDGPU::S_OR_B32:
+ case AMDGPU::S_OR_B64:
+ case AMDGPU::S_ORN2_B32:
+ case AMDGPU::S_ORN2_B64:
+ case AMDGPU::S_QUADMASK_B32:
+ case AMDGPU::S_QUADMASK_B64:
+ case AMDGPU::S_WQM_B32:
+ case AMDGPU::S_WQM_B64:
+ case AMDGPU::S_XNOR_B32:
+ case AMDGPU::S_XNOR_B64:
+ case AMDGPU::S_XOR_B32:
+ case AMDGPU::S_XOR_B64:
+ return true;
+ default:
+ return false;
+ }
+ }
+
static bool isEXP(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::EXP;
}
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 42ec8ba..ee10190 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -775,10 +775,10 @@ let SubtargetPredicate = HasMinimum3Maximum3F16, ReadsModeReg = 0 in {
} // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0
let SubtargetPredicate = HasAddMinMaxInsts, isCommutable = 1, isReMaterializable = 1 in {
- defm V_ADD_MAX_I32 : VOP3Inst <"v_add_max_i32", VOP_I32_I32_I32_I32>;
- defm V_ADD_MAX_U32 : VOP3Inst <"v_add_max_u32", VOP_I32_I32_I32_I32>;
- defm V_ADD_MIN_I32 : VOP3Inst <"v_add_min_i32", VOP_I32_I32_I32_I32>;
- defm V_ADD_MIN_U32 : VOP3Inst <"v_add_min_u32", VOP_I32_I32_I32_I32>;
+ defm V_ADD_MAX_I32 : VOP3Inst <"v_add_max_i32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>, int_amdgcn_add_max_i32>;
+ defm V_ADD_MAX_U32 : VOP3Inst <"v_add_max_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>, int_amdgcn_add_max_u32>;
+ defm V_ADD_MIN_I32 : VOP3Inst <"v_add_min_i32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>, int_amdgcn_add_min_i32>;
+ defm V_ADD_MIN_U32 : VOP3Inst <"v_add_min_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>, int_amdgcn_add_min_u32>;
}
defm V_ADD_I16 : VOP3Inst_t16 <"v_add_i16", VOP_I16_I16_I16>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 6500fce..c4692b7 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -75,7 +75,7 @@ multiclass VOP3PInst<string OpName, VOPProfile P,
SDPatternOperator node = null_frag, bit IsDOT = 0> {
def NAME : VOP3P_Pseudo<OpName, P,
!if (P.HasModifiers,
- getVOP3PModPat<P, node, IsDOT, IsDOT>.ret,
+ getVOP3PModPat<P, node, !or(P.EnableClamp, IsDOT), IsDOT>.ret,
getVOP3Pat<P, node>.ret)>;
let SubtargetPredicate = isGFX11Plus in {
if P.HasExtVOP3DPP then
@@ -434,15 +434,16 @@ defm : MadFmaMixFP16Pats_t16<fma, V_FMA_MIX_BF16_t16>;
} // End SubtargetPredicate = HasFmaMixBF16Insts
def PK_ADD_MINMAX_Profile : VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16, VOP3_PACKED> {
- let HasModifiers = 0;
+ let HasNeg = 0;
+ let EnableClamp = 1;
}
let isCommutable = 1, isReMaterializable = 1 in {
let SubtargetPredicate = HasPkAddMinMaxInsts in {
-defm V_PK_ADD_MAX_I16 : VOP3PInst<"v_pk_add_max_i16", PK_ADD_MINMAX_Profile>;
-defm V_PK_ADD_MAX_U16 : VOP3PInst<"v_pk_add_max_u16", PK_ADD_MINMAX_Profile>;
-defm V_PK_ADD_MIN_I16 : VOP3PInst<"v_pk_add_min_i16", PK_ADD_MINMAX_Profile>;
-defm V_PK_ADD_MIN_U16 : VOP3PInst<"v_pk_add_min_u16", PK_ADD_MINMAX_Profile>;
+defm V_PK_ADD_MAX_I16 : VOP3PInst<"v_pk_add_max_i16", PK_ADD_MINMAX_Profile, int_amdgcn_pk_add_max_i16>;
+defm V_PK_ADD_MAX_U16 : VOP3PInst<"v_pk_add_max_u16", PK_ADD_MINMAX_Profile, int_amdgcn_pk_add_max_u16>;
+defm V_PK_ADD_MIN_I16 : VOP3PInst<"v_pk_add_min_i16", PK_ADD_MINMAX_Profile, int_amdgcn_pk_add_min_i16>;
+defm V_PK_ADD_MIN_U16 : VOP3PInst<"v_pk_add_min_u16", PK_ADD_MINMAX_Profile, int_amdgcn_pk_add_min_u16>;
}
let SubtargetPredicate = HasPkMinMax3Insts in {
defm V_PK_MAX3_I16 : VOP3PInst<"v_pk_max3_i16", PK_ADD_MINMAX_Profile>;
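
The defm changes above route the new instructions through dedicated intrinsics. A hedged IR sketch follows; the operand lists are an assumption inferred from the VOP_I32_I32_I32_I32 and VOP_V2I16_V2I16_V2I16_V2I16 profiles (any clamp operand is omitted here), so check IntrinsicsAMDGPU.td for the authoritative forms:

; Assumed signatures -- not shown in this patch.
declare i32 @llvm.amdgcn.add.max.i32(i32, i32, i32)
declare <2 x i16> @llvm.amdgcn.pk.add.max.i16(<2 x i16>, <2 x i16>, <2 x i16>)

define i32 @add_max_example(i32 %a, i32 %b, i32 %c) {
  %r = call i32 @llvm.amdgcn.add.max.i32(i32 %a, i32 %b, i32 %c)
  ret i32 %r
}
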
diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
index 1f773e2..3368a50 100644
--- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -820,7 +820,7 @@ void ARMAsmPrinter::emitAttributes() {
auto *BTIValue = mdconst::extract_or_null<ConstantInt>(
SourceModule->getModuleFlag("branch-target-enforcement"));
- if (BTIValue && BTIValue->isOne()) {
+ if (BTIValue && !BTIValue->isZero()) {
// If "+pacbti" is used as an architecture extension,
// Tag_BTI_extension is emitted in
// ARMTargetStreamer::emitTargetAttributes().
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 35e1127..b1a668e 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1089,7 +1089,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
// Register based DivRem for AEABI (RTABI 4.2)
if (TT.isTargetAEABI() || TT.isAndroid() || TT.isTargetGNUAEABI() ||
- TT.isTargetMuslAEABI() || TT.isOSWindows()) {
+ TT.isTargetMuslAEABI() || TT.isOSFuchsia() || TT.isOSWindows()) {
setOperationAction(ISD::SREM, MVT::i64, Custom);
setOperationAction(ISD::UREM, MVT::i64, Custom);
HasStandaloneRem = false;
@@ -1353,6 +1353,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
setOperationAction(ISD::FLOG10, MVT::f16, Promote);
setOperationAction(ISD::FLOG2, MVT::f16, Promote);
setOperationAction(ISD::LRINT, MVT::f16, Expand);
+ setOperationAction(ISD::LROUND, MVT::f16, Expand);
setOperationAction(ISD::FROUND, MVT::f16, Legal);
setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
@@ -20574,7 +20575,7 @@ static TargetLowering::ArgListTy getDivRemArgList(
SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
- Subtarget->isTargetWindows()) &&
+ Subtarget->isTargetFuchsia() || Subtarget->isTargetWindows()) &&
"Register-based DivRem lowering only");
unsigned Opcode = Op->getOpcode();
assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
index 406f4c1..597d311 100644
--- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -1036,6 +1036,7 @@ bool LowOverheadLoop::ValidateLiveOuts() {
while (!Worklist.empty()) {
MachineInstr *MI = Worklist.pop_back_val();
if (MI->getOpcode() == ARM::MQPRCopy) {
+ LLVM_DEBUG(dbgs() << " Must generate copy as VMOV: " << *MI);
VMOVCopies.insert(MI);
MachineInstr *CopySrc =
RDI.getUniqueReachingMIDef(MI, MI->getOperand(1).getReg());
@@ -1045,6 +1046,20 @@ bool LowOverheadLoop::ValidateLiveOuts() {
LLVM_DEBUG(dbgs() << " Unable to handle live out: " << *MI);
VMOVCopies.clear();
return false;
+ } else if (isVectorPredicated(MI)) {
+ // If this is a predicated instruction with merging semantics,
+ // check where it gets its false lanes from, if any.
+ int InactiveIdx = findVPTInactiveOperandIdx(*MI);
+ if (InactiveIdx != -1) {
+ SmallPtrSet<MachineInstr *, 2> Defs;
+ MachineInstr *FalseSrc = RDI.getUniqueReachingMIDef(
+ MI, MI->getOperand(InactiveIdx).getReg());
+ if (FalseSrc) {
+ LLVM_DEBUG(dbgs()
+ << " Must check source of false lanes for: " << *MI);
+ Worklist.push_back(FalseSrc);
+ }
+ }
}
}
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
index b2d368e..4a0883c 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -343,6 +343,7 @@ public:
bool isTargetWatchOS() const { return TargetTriple.isWatchOS(); }
bool isTargetWatchABI() const { return TargetTriple.isWatchABI(); }
bool isTargetDriverKit() const { return TargetTriple.isDriverKit(); }
+ bool isTargetFuchsia() const { return TargetTriple.isOSFuchsia(); }
bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
bool isTargetNetBSD() const { return TargetTriple.isOSNetBSD(); }
bool isTargetWindows() const { return TargetTriple.isOSWindows(); }
diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
index 431ce38..f5653d4 100644
--- a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -805,6 +805,16 @@ int llvm::findFirstVPTPredOperandIdx(const MachineInstr &MI) {
return -1;
}
+int llvm::findVPTInactiveOperandIdx(const MachineInstr &MI) {
+ const MCInstrDesc &MCID = MI.getDesc();
+
+ for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i)
+ if (MCID.operands()[i].OperandType == ARM::OPERAND_VPRED_R)
+ return i + ARM::SUBOP_vpred_r_inactive;
+
+ return -1;
+}
+
ARMVCC::VPTCodes llvm::getVPTInstrPredicate(const MachineInstr &MI,
Register &PredReg) {
int PIdx = findFirstVPTPredOperandIdx(MI);
diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/llvm/lib/Target/ARM/Thumb2InstrInfo.h
index 3ec3a621..1b0bf2d 100644
--- a/llvm/lib/Target/ARM/Thumb2InstrInfo.h
+++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.h
@@ -90,6 +90,9 @@ inline ARMVCC::VPTCodes getVPTInstrPredicate(const MachineInstr &MI) {
Register PredReg;
return getVPTInstrPredicate(MI, PredReg);
}
+// Identify the input operand in an MVE predicated instruction which
+// contributes the values of any inactive vector lanes.
+int findVPTInactiveOperandIdx(const MachineInstr &MI);
// Recomputes the Block Mask of Instr, a VPT or VPST instruction.
// This rebuilds the block mask of the instruction depending on the predicates
diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp
index 9b5fc9d..a652b7e 100644
--- a/llvm/lib/Target/BPF/BTFDebug.cpp
+++ b/llvm/lib/Target/BPF/BTFDebug.cpp
@@ -95,7 +95,24 @@ void BTFTypeDerived::completeType(BTFDebug &BDebug) {
return;
IsCompleted = true;
- BTFType.NameOff = BDebug.addString(Name);
+ switch (Kind) {
+ case BTF::BTF_KIND_PTR:
+ case BTF::BTF_KIND_CONST:
+ case BTF::BTF_KIND_VOLATILE:
+ case BTF::BTF_KIND_RESTRICT:
+ // Debug info might contain names for these types, but given that we want
+ // to keep BTF minimal and naming reference types doesn't bring any value
+ // (what matters is the completeness of the base type), we don't emit them.
+ //
+ // Furthermore, the Linux kernel refuses to load BPF programs that contain
+ // BTF with these types named:
+ // https://elixir.bootlin.com/linux/v6.17.1/source/kernel/bpf/btf.c#L2586
+ BTFType.NameOff = 0;
+ break;
+ default:
+ BTFType.NameOff = BDebug.addString(Name);
+ break;
+ }
if (NeedsFixup || !DTy)
return;
diff --git a/llvm/lib/Target/DirectX/DXILPrepare.cpp b/llvm/lib/Target/DirectX/DXILPrepare.cpp
index c8866bf..42e90f0 100644
--- a/llvm/lib/Target/DirectX/DXILPrepare.cpp
+++ b/llvm/lib/Target/DirectX/DXILPrepare.cpp
@@ -294,6 +294,14 @@ public:
if (NamedMDNode *RootSignature = M.getNamedMetadata("dx.rootsignatures"))
RootSignature->eraseFromParent();
+ // llvm.errno.tbaa was recently added but is not supported in LLVM 3.7 and
+ // causes all tests using the DXIL Validator to fail.
+ //
+ // This is a temporary fix and should be replaced with a whitelist once
+ // we have determined all metadata that the DXIL Validator allows
+ if (NamedMDNode *ErrNo = M.getNamedMetadata("llvm.errno.tbaa"))
+ ErrNo->eraseFromParent();
+
return true;
}
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
index a94e131..54c8972 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
@@ -117,8 +117,10 @@ HexagonTargetLowering::initializeHVXLowering() {
setOperationAction(ISD::VECTOR_SHUFFLE, ByteW, Legal);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
- if (Subtarget.useHVX128BOps())
+ if (Subtarget.useHVX128BOps()) {
setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
+ setOperationAction(ISD::BITCAST, MVT::v64i1, Custom);
+ }
if (Subtarget.useHVX128BOps() && Subtarget.useHVXV68Ops() &&
Subtarget.useHVXFloatingPoint()) {
@@ -2024,13 +2026,9 @@ HexagonTargetLowering::LowerHvxBitcast(SDValue Op, SelectionDAG &DAG) const {
// Handle bitcast from i32, v2i16, and v4i8 to v32i1.
// Splat the input into a 32-element i32 vector, then AND each element
// with a unique bitmask to isolate individual bits.
- if (ResTy == MVT::v32i1 &&
- (ValTy == MVT::i32 || ValTy == MVT::v2i16 || ValTy == MVT::v4i8) &&
- Subtarget.useHVX128BOps()) {
- SDValue Val32 = Val;
- if (ValTy == MVT::v2i16 || ValTy == MVT::v4i8)
- Val32 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Val);
-
+ auto bitcastI32ToV32I1 = [&](SDValue Val32) {
+ assert(Val32.getValueType().getSizeInBits() == 32 &&
+ "Input must be 32 bits");
MVT VecTy = MVT::getVectorVT(MVT::i32, 32);
SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, dl, VecTy, Val32);
SmallVector<SDValue, 32> Mask;
@@ -2039,7 +2037,31 @@ HexagonTargetLowering::LowerHvxBitcast(SDValue Op, SelectionDAG &DAG) const {
SDValue MaskVec = DAG.getBuildVector(VecTy, dl, Mask);
SDValue Anded = DAG.getNode(ISD::AND, dl, VecTy, Splat, MaskVec);
- return DAG.getNode(HexagonISD::V2Q, dl, ResTy, Anded);
+ return DAG.getNode(HexagonISD::V2Q, dl, MVT::v32i1, Anded);
+ };
+ // === Case: v32i1 ===
+ if (ResTy == MVT::v32i1 &&
+ (ValTy == MVT::i32 || ValTy == MVT::v2i16 || ValTy == MVT::v4i8) &&
+ Subtarget.useHVX128BOps()) {
+ SDValue Val32 = Val;
+ if (ValTy == MVT::v2i16 || ValTy == MVT::v4i8)
+ Val32 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Val);
+ return bitcastI32ToV32I1(Val32);
+ }
+ // === Case: v64i1 ===
+ if (ResTy == MVT::v64i1 && ValTy == MVT::i64 && Subtarget.useHVX128BOps()) {
+ // Split i64 into lo/hi 32-bit halves.
+ SDValue Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Val);
+ SDValue HiShifted = DAG.getNode(ISD::SRL, dl, MVT::i64, Val,
+ DAG.getConstant(32, dl, MVT::i64));
+ SDValue Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, HiShifted);
+
+ // Reuse the same 32-bit logic twice.
+ SDValue LoRes = bitcastI32ToV32I1(Lo);
+ SDValue HiRes = bitcastI32ToV32I1(Hi);
+
+ // Concatenate into a v64i1 predicate.
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, LoRes, HiRes);
}
if (isHvxBoolTy(ResTy) && ValTy.isScalarInteger()) {
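
For reference, the new v64i1 case accepts IR like the following on HVX 128B targets; the i64 source is split into 32-bit halves and each half reuses the existing v32i1 lowering before the halves are concatenated:

; Illustrative input that now takes the v64i1 bitcast path.
define <64 x i1> @bits_to_pred(i64 %x) {
  %m = bitcast i64 %x to <64 x i1>
  ret <64 x i1> %m
}
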
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index f7deeaf..ca4a655 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -2614,6 +2614,9 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
if ((Result = lowerVECTOR_SHUFFLE_XVSHUF4I(DL, Mask, VT, V1, V2, DAG,
Subtarget)))
return Result;
+ // Try to widen vectors to gain more optimization opportunities.
+ if (SDValue NewShuffle = widenShuffleMask(DL, Mask, VT, V1, V2, DAG))
+ return NewShuffle;
if ((Result =
lowerVECTOR_SHUFFLE_XVPERMI(DL, Mask, VT, V1, DAG, Subtarget)))
return Result;
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 8bf0d11..17f04d0 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -442,14 +442,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
// If we're enabling GP optimizations, use hardware square root
- if (!Subtarget.hasFSQRT() &&
- !(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTE() &&
- Subtarget.hasFRE()))
+ if (!Subtarget.hasFSQRT() && !(Subtarget.hasFRSQRTE() && Subtarget.hasFRE()))
setOperationAction(ISD::FSQRT, MVT::f64, Expand);
if (!Subtarget.hasFSQRT() &&
- !(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTES() &&
- Subtarget.hasFRES()))
+ !(Subtarget.hasFRSQRTES() && Subtarget.hasFRES()))
setOperationAction(ISD::FSQRT, MVT::f32, Expand);
if (Subtarget.hasFCPSGN()) {
@@ -565,16 +562,15 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::BITCAST, MVT::i32, Legal);
setOperationAction(ISD::BITCAST, MVT::i64, Legal);
setOperationAction(ISD::BITCAST, MVT::f64, Legal);
- if (TM.Options.UnsafeFPMath) {
- setOperationAction(ISD::LRINT, MVT::f64, Legal);
- setOperationAction(ISD::LRINT, MVT::f32, Legal);
- setOperationAction(ISD::LLRINT, MVT::f64, Legal);
- setOperationAction(ISD::LLRINT, MVT::f32, Legal);
- setOperationAction(ISD::LROUND, MVT::f64, Legal);
- setOperationAction(ISD::LROUND, MVT::f32, Legal);
- setOperationAction(ISD::LLROUND, MVT::f64, Legal);
- setOperationAction(ISD::LLROUND, MVT::f32, Legal);
- }
+
+ setOperationAction(ISD::STRICT_LRINT, MVT::f64, Custom);
+ setOperationAction(ISD::STRICT_LRINT, MVT::f32, Custom);
+ setOperationAction(ISD::STRICT_LLRINT, MVT::f64, Custom);
+ setOperationAction(ISD::STRICT_LLRINT, MVT::f32, Custom);
+ setOperationAction(ISD::STRICT_LROUND, MVT::f64, Custom);
+ setOperationAction(ISD::STRICT_LROUND, MVT::f32, Custom);
+ setOperationAction(ISD::STRICT_LLROUND, MVT::f64, Custom);
+ setOperationAction(ISD::STRICT_LLROUND, MVT::f32, Custom);
} else {
setOperationAction(ISD::BITCAST, MVT::f32, Expand);
setOperationAction(ISD::BITCAST, MVT::i32, Expand);
@@ -1034,11 +1030,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);
// The nearbyint variants are not allowed to raise the inexact exception
- // so we can only code-gen them with unsafe math.
- if (TM.Options.UnsafeFPMath) {
- setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
- setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
- }
+ // so we can only code-gen them with fpexcept.ignore.
+ setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f64, Custom);
+ setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f32, Custom);
setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
@@ -8911,11 +8905,13 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
// be lost at this stage, but is below the single-precision rounding
// position.
//
- // However, if -enable-unsafe-fp-math is in effect, accept double
+ // However, if afn is in effect, accept double
// rounding to avoid the extra overhead.
- if (Op.getValueType() == MVT::f32 &&
- !Subtarget.hasFPCVT() &&
- !DAG.getTarget().Options.UnsafeFPMath) {
+ // FIXME: Currently INT_TO_FP can't support fast math flags because
+ // of nneg flag, thus Op->getFlags().hasApproximateFuncs() is always
+ // false.
+ if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT() &&
+ !Op->getFlags().hasApproximateFuncs()) {
// Twiddle input to make sure the low 11 bits are zero. (If this
// is the case, we are guaranteed the value will fit into the 53 bit
@@ -12759,6 +12755,14 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerADDSUBO_CARRY(Op, DAG);
case ISD::UCMP:
return LowerUCMP(Op, DAG);
+ case ISD::STRICT_LRINT:
+ case ISD::STRICT_LLRINT:
+ case ISD::STRICT_LROUND:
+ case ISD::STRICT_LLROUND:
+ case ISD::STRICT_FNEARBYINT:
+ if (Op->getFlags().hasNoFPExcept())
+ return Op;
+ return SDValue();
}
}
@@ -13088,7 +13092,9 @@ PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB,
BuildMI(BB, dl, TII->get(StoreMnemonic))
.addReg(TmpReg).addReg(ptrA).addReg(ptrB);
BuildMI(BB, dl, TII->get(PPC::BCC))
- .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB);
+ .addImm(PPC::PRED_NE_MINUS)
+ .addReg(PPC::CR0)
+ .addMBB(loopMBB);
BB->addSuccessor(loopMBB);
BB->addSuccessor(exitMBB);
@@ -13346,7 +13352,7 @@ MachineBasicBlock *PPCTargetLowering::EmitPartwordAtomicBinary(
.addReg(ZeroReg)
.addReg(PtrReg);
BuildMI(BB, dl, TII->get(PPC::BCC))
- .addImm(PPC::PRED_NE)
+ .addImm(PPC::PRED_NE_MINUS)
.addReg(PPC::CR0)
.addMBB(loopMBB);
BB->addSuccessor(loopMBB);
@@ -14177,7 +14183,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
.addReg(dest)
.addReg(oldval);
BuildMI(BB, dl, TII->get(PPC::BCC))
- .addImm(PPC::PRED_NE)
+ .addImm(PPC::PRED_NE_MINUS)
.addReg(CrReg)
.addMBB(exitMBB);
BB->addSuccessor(loop2MBB);
@@ -14189,7 +14195,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
.addReg(ptrA)
.addReg(ptrB);
BuildMI(BB, dl, TII->get(PPC::BCC))
- .addImm(PPC::PRED_NE)
+ .addImm(PPC::PRED_NE_MINUS)
.addReg(PPC::CR0)
.addMBB(loop1MBB);
BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB);
@@ -14730,8 +14736,8 @@ SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand, SelectionDAG &DAG,
}
unsigned PPCTargetLowering::combineRepeatedFPDivisors() const {
- // Note: This functionality is used only when unsafe-fp-math is enabled, and
- // on cores with reciprocal estimates (which are used when unsafe-fp-math is
+ // Note: This functionality is used only when arcp is enabled, and
+ // on cores with reciprocal estimates (which are used when arcp is
// enabled for division), this functionality is redundant with the default
// combiner logic (once the division -> reciprocal/multiply transformation
// has taken place). As a result, this matters more for older cores than for
@@ -18707,11 +18713,12 @@ bool PPCTargetLowering::isProfitableToHoist(Instruction *I) const {
const Function *F = I->getFunction();
const DataLayout &DL = F->getDataLayout();
Type *Ty = User->getOperand(0)->getType();
+ bool AllowContract = I->getFastMathFlags().allowContract() &&
+ User->getFastMathFlags().allowContract();
- return !(
- isFMAFasterThanFMulAndFAdd(*F, Ty) &&
- isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
- (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath));
+ return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
+ isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
+ (AllowContract || Options.AllowFPOpFusion == FPOpFusion::Fast));
}
case Instruction::Load: {
// Don't break "store (load float*)" pattern, this pattern will be combined
diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
index 979ba31..885bed6 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -2786,14 +2786,16 @@ def : Pat<(v2f64 (any_frint v2f64:$S)), (v2f64 (XVRDPIC $S))>;
// Rounding without exceptions (nearbyint). Due to strange tblgen behaviour,
// these need to be defined after the any_frint versions so ISEL will correctly
// add the chain to the strict versions.
-def : Pat<(f32 (fnearbyint f32:$S)),
+// TODO: Match strict fp rounding intrinsics with instructions like xsrdpiz when
+// rounding mode is propagated to CodeGen part.
+def : Pat<(f32 (strict_fnearbyint f32:$S)),
(f32 (COPY_TO_REGCLASS (XSRDPIC
(COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>;
-def : Pat<(f64 (fnearbyint f64:$S)),
+def : Pat<(f64 (strict_fnearbyint f64:$S)),
(f64 (XSRDPIC $S))>;
-def : Pat<(v2f64 (fnearbyint v2f64:$S)),
+def : Pat<(v2f64 (strict_fnearbyint v2f64:$S)),
(v2f64 (XVRDPIC $S))>;
-def : Pat<(v4f32 (fnearbyint v4f32:$S)),
+def : Pat<(v4f32 (strict_fnearbyint v4f32:$S)),
(v4f32 (XVRSPIC $S))>;
// Materialize a zero-vector of long long
@@ -3578,25 +3580,25 @@ def : Pat<(f64 (bitconvert i64:$S)),
(f64 (MTVSRD $S))>;
// Rounding to integer.
-def : Pat<(i64 (lrint f64:$S)),
+def : Pat<(i64 (strict_lrint f64:$S)),
(i64 (MFVSRD (FCTID $S)))>;
-def : Pat<(i64 (lrint f32:$S)),
+def : Pat<(i64 (strict_lrint f32:$S)),
(i64 (MFVSRD (FCTID (COPY_TO_REGCLASS $S, F8RC))))>;
-def : Pat<(i64 (llrint f64:$S)),
+def : Pat<(i64 (strict_llrint f64:$S)),
(i64 (MFVSRD (FCTID $S)))>;
-def : Pat<(i64 (llrint f32:$S)),
+def : Pat<(i64 (strict_llrint f32:$S)),
(i64 (MFVSRD (FCTID (COPY_TO_REGCLASS $S, F8RC))))>;
-def : Pat<(i64 (lround f64:$S)),
+def : Pat<(i64 (strict_lround f64:$S)),
(i64 (MFVSRD (FCTID (XSRDPI $S))))>;
-def : Pat<(i64 (lround f32:$S)),
+def : Pat<(i64 (strict_lround f32:$S)),
(i64 (MFVSRD (FCTID (XSRDPI (COPY_TO_REGCLASS $S, VSFRC)))))>;
-def : Pat<(i32 (lround f64:$S)),
+def : Pat<(i32 (strict_lround f64:$S)),
(i32 (MFVSRWZ (FCTIW (XSRDPI $S))))>;
-def : Pat<(i32 (lround f32:$S)),
+def : Pat<(i32 (strict_lround f32:$S)),
(i32 (MFVSRWZ (FCTIW (XSRDPI (COPY_TO_REGCLASS $S, VSFRC)))))>;
-def : Pat<(i64 (llround f64:$S)),
+def : Pat<(i64 (strict_llround f64:$S)),
(i64 (MFVSRD (FCTID (XSRDPI $S))))>;
-def : Pat<(i64 (llround f32:$S)),
+def : Pat<(i64 (strict_llround f32:$S)),
(i64 (MFVSRD (FCTID (XSRDPI (COPY_TO_REGCLASS $S, VSFRC)))))>;
// Alternate patterns for PPCmtvsrz where the output is v8i16 or v16i8 instead
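
These patterns now match the constrained (strict) nodes, which the custom lowering in PPCISelLowering.cpp only lets through when FP exceptions are ignored. A minimal sketch of such input IR (function name is illustrative):

; Constrained lrint with fpexcept.ignore; a function using constrained FP
; intrinsics must carry the strictfp attribute.
declare i64 @llvm.experimental.constrained.lrint.i64.f64(double, metadata, metadata)

define i64 @lrint_ignore(double %x) strictfp {
  %r = call i64 @llvm.experimental.constrained.lrint.i64.f64(double %x, metadata !"round.dynamic", metadata !"fpexcept.ignore")
  ret i64 %r
}
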
diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index e857b2d..edde7ac 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -2406,7 +2406,8 @@ ParseStatus RISCVAsmParser::parseVTypeI(OperandVector &Operands) {
}
bool RISCVAsmParser::generateVTypeError(SMLoc ErrorLoc) {
- if (STI->hasFeature(RISCV::FeatureStdExtZvfbfa))
+ if (STI->hasFeature(RISCV::FeatureStdExtZvfbfa) ||
+ STI->hasFeature(RISCV::FeatureVendorXSfvfbfexp16e))
return Error(
ErrorLoc,
"operand must be "
diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index b8ec0bb..4bea4c4 100644
--- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -654,7 +654,10 @@ static constexpr FeatureBitset XqciFeatureGroup = {
static constexpr FeatureBitset XSfVectorGroup = {
RISCV::FeatureVendorXSfvcp, RISCV::FeatureVendorXSfvqmaccdod,
RISCV::FeatureVendorXSfvqmaccqoq, RISCV::FeatureVendorXSfvfwmaccqqq,
- RISCV::FeatureVendorXSfvfnrclipxfqf, RISCV::FeatureVendorXSfmmbase};
+ RISCV::FeatureVendorXSfvfnrclipxfqf, RISCV::FeatureVendorXSfmmbase,
+ RISCV::FeatureVendorXSfvfexpa, RISCV::FeatureVendorXSfvfexpa64e,
+ RISCV::FeatureVendorXSfvfbfexp16e, RISCV::FeatureVendorXSfvfexp16e,
+ RISCV::FeatureVendorXSfvfexp32e};
static constexpr FeatureBitset XSfSystemGroup = {
RISCV::FeatureVendorXSiFivecdiscarddlone,
RISCV::FeatureVendorXSiFivecflushdlone,
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerCombiner.cpp b/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerCombiner.cpp
index 67b510d..f2b216b 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerCombiner.cpp
@@ -27,6 +27,7 @@
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Support/FormatVariadic.h"
#define GET_GICOMBINER_DEPS
#include "RISCVGenPostLegalizeGICombiner.inc"
@@ -42,6 +43,56 @@ namespace {
#include "RISCVGenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_TYPES
+/// Match: G_STORE (G_FCONSTANT +0.0), addr
+/// Return the source vreg in MatchInfo if matched.
+bool matchFoldFPZeroStore(MachineInstr &MI, MachineRegisterInfo &MRI,
+ const RISCVSubtarget &STI, Register &MatchInfo) {
+ if (MI.getOpcode() != TargetOpcode::G_STORE)
+ return false;
+
+ Register SrcReg = MI.getOperand(0).getReg();
+ if (!SrcReg.isVirtual())
+ return false;
+
+ MachineInstr *Def = MRI.getVRegDef(SrcReg);
+ if (!Def || Def->getOpcode() != TargetOpcode::G_FCONSTANT)
+ return false;
+
+ auto *CFP = Def->getOperand(1).getFPImm();
+ if (!CFP || !CFP->getValueAPF().isPosZero())
+ return false;
+
+ unsigned ValBits = MRI.getType(SrcReg).getSizeInBits();
+ if ((ValBits == 16 && !STI.hasStdExtZfh()) ||
+ (ValBits == 32 && !STI.hasStdExtF()) ||
+ (ValBits == 64 && (!STI.hasStdExtD() || !STI.is64Bit())))
+ return false;
+
+ MatchInfo = SrcReg;
+ return true;
+}
+
+/// Apply: rewrite to G_STORE (G_CONSTANT 0 [XLEN]), addr
+void applyFoldFPZeroStore(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B, const RISCVSubtarget &STI,
+ Register &MatchInfo) {
+ const unsigned XLen = STI.getXLen();
+
+ auto Zero = B.buildConstant(LLT::scalar(XLen), 0);
+ MI.getOperand(0).setReg(Zero.getReg(0));
+
+ MachineInstr *Def = MRI.getVRegDef(MatchInfo);
+ if (Def && MRI.use_nodbg_empty(MatchInfo))
+ Def->eraseFromParent();
+
+#ifndef NDEBUG
+ unsigned ValBits = MRI.getType(MatchInfo).getSizeInBits();
+ LLVM_DEBUG(dbgs() << formatv("[{0}] Fold FP zero store -> int zero "
+ "(XLEN={1}, ValBits={2}):\n {3}\n",
+ DEBUG_TYPE, XLen, ValBits, MI));
+#endif
+}
+
class RISCVPostLegalizerCombinerImpl : public Combiner {
protected:
const CombinerHelper Helper;
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
index 50f5a5d..7b9c4b3 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
@@ -220,7 +220,8 @@ void RISCVInstPrinter::printVTypeI(const MCInst *MI, unsigned OpNo,
if (RISCVVType::getVLMUL(Imm) == RISCVVType::VLMUL::LMUL_RESERVED ||
RISCVVType::getSEW(Imm) > 64 ||
(RISCVVType::isAltFmt(Imm) &&
- !STI.hasFeature(RISCV::FeatureStdExtZvfbfa)) ||
+ !(STI.hasFeature(RISCV::FeatureStdExtZvfbfa) ||
+ STI.hasFeature(RISCV::FeatureVendorXSfvfbfexp16e))) ||
(Imm >> 9) != 0) {
O << formatImm(Imm);
return;
diff --git a/llvm/lib/Target/RISCV/RISCVCombine.td b/llvm/lib/Target/RISCV/RISCVCombine.td
index 995dd0c..a06b60d 100644
--- a/llvm/lib/Target/RISCV/RISCVCombine.td
+++ b/llvm/lib/Target/RISCV/RISCVCombine.td
@@ -19,11 +19,20 @@ def RISCVO0PreLegalizerCombiner: GICombiner<
"RISCVO0PreLegalizerCombinerImpl", [optnone_combines]> {
}
+// Rule: fold store (fp +0.0) -> store (int zero [XLEN])
+def fp_zero_store_matchdata : GIDefMatchData<"Register">;
+def fold_fp_zero_store : GICombineRule<
+ (defs root:$root, fp_zero_store_matchdata:$matchinfo),
+ (match (G_STORE $src, $addr):$root,
+ [{ return matchFoldFPZeroStore(*${root}, MRI, STI, ${matchinfo}); }]),
+ (apply [{ applyFoldFPZeroStore(*${root}, MRI, B, STI, ${matchinfo}); }])>;
+
// Post-legalization combines which are primarily optimizations.
// TODO: Add more combines.
def RISCVPostLegalizerCombiner
: GICombiner<"RISCVPostLegalizerCombinerImpl",
[sub_to_add, combines_for_extload, redundant_and,
identity_combines, shift_immed_chain,
- commute_constant_to_rhs, simplify_neg_minmax]> {
+ commute_constant_to_rhs, simplify_neg_minmax,
+ fold_fp_zero_store]> {
}
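
The rule above fires on ordinary FP +0.0 stores; a minimal example whose store is rewritten into an XLEN-wide integer-zero store (which can then select the x0 register):

; The G_STORE of the +0.0 G_FCONSTANT below is rewritten by fold_fp_zero_store.
define void @store_fp_zero(ptr %p) {
  store float 0.000000e+00, ptr %p
  ret void
}
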
diff --git a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp
index 5dd4bf4..98b636e 100644
--- a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp
+++ b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp
@@ -109,12 +109,70 @@ bool RISCVExpandAtomicPseudo::expandMI(MachineBasicBlock &MBB,
// expanded instructions for each pseudo is correct in the Size field of the
// tablegen definition for the pseudo.
switch (MBBI->getOpcode()) {
+ case RISCV::PseudoAtomicSwap32:
+ return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Xchg, false, 32,
+ NextMBBI);
+ case RISCV::PseudoAtomicSwap64:
+ return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Xchg, false, 64,
+ NextMBBI);
+ case RISCV::PseudoAtomicLoadAdd32:
+ return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Add, false, 32,
+ NextMBBI);
+ case RISCV::PseudoAtomicLoadAdd64:
+ return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Add, false, 64,
+ NextMBBI);
+ case RISCV::PseudoAtomicLoadSub32:
+ return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Sub, false, 32,
+ NextMBBI);
+ case RISCV::PseudoAtomicLoadSub64:
+ return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Sub, false, 64,
+ NextMBBI);
+ case RISCV::PseudoAtomicLoadAnd32:
+ return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::And, false, 32,
+ NextMBBI);
+ case RISCV::PseudoAtomicLoadAnd64:
+ return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::And, false, 64,
+ NextMBBI);
+ case RISCV::PseudoAtomicLoadOr32:
+ return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Or, false, 32, NextMBBI);
+ case RISCV::PseudoAtomicLoadOr64:
+ return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Or, false, 64, NextMBBI);
+ case RISCV::PseudoAtomicLoadXor32:
+ return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Xor, false, 32,
+ NextMBBI);
+ case RISCV::PseudoAtomicLoadXor64:
+ return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Xor, false, 64,
+ NextMBBI);
case RISCV::PseudoAtomicLoadNand32:
return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Nand, false, 32,
NextMBBI);
case RISCV::PseudoAtomicLoadNand64:
return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Nand, false, 64,
NextMBBI);
+ case RISCV::PseudoAtomicLoadMin32:
+ return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::Min, false, 32,
+ NextMBBI);
+ case RISCV::PseudoAtomicLoadMin64:
+ return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::Min, false, 64,
+ NextMBBI);
+ case RISCV::PseudoAtomicLoadMax32:
+ return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::Max, false, 32,
+ NextMBBI);
+ case RISCV::PseudoAtomicLoadMax64:
+ return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::Max, false, 64,
+ NextMBBI);
+ case RISCV::PseudoAtomicLoadUMin32:
+ return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::UMin, false, 32,
+ NextMBBI);
+ case RISCV::PseudoAtomicLoadUMin64:
+ return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::UMin, false, 64,
+ NextMBBI);
+ case RISCV::PseudoAtomicLoadUMax32:
+ return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::UMax, false, 32,
+ NextMBBI);
+ case RISCV::PseudoAtomicLoadUMax64:
+ return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::UMax, false, 64,
+ NextMBBI);
case RISCV::PseudoMaskedAtomicSwap32:
return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Xchg, true, 32,
NextMBBI);
@@ -277,6 +335,36 @@ static void doAtomicBinOpExpansion(const RISCVInstrInfo *TII, MachineInstr &MI,
switch (BinOp) {
default:
llvm_unreachable("Unexpected AtomicRMW BinOp");
+ case AtomicRMWInst::Xchg:
+ BuildMI(LoopMBB, DL, TII->get(RISCV::ADDI), ScratchReg)
+ .addReg(IncrReg)
+ .addImm(0);
+ break;
+ case AtomicRMWInst::Add:
+ BuildMI(LoopMBB, DL, TII->get(RISCV::ADD), ScratchReg)
+ .addReg(DestReg)
+ .addReg(IncrReg);
+ break;
+ case AtomicRMWInst::Sub:
+ BuildMI(LoopMBB, DL, TII->get(RISCV::SUB), ScratchReg)
+ .addReg(DestReg)
+ .addReg(IncrReg);
+ break;
+ case AtomicRMWInst::And:
+ BuildMI(LoopMBB, DL, TII->get(RISCV::AND), ScratchReg)
+ .addReg(DestReg)
+ .addReg(IncrReg);
+ break;
+ case AtomicRMWInst::Or:
+ BuildMI(LoopMBB, DL, TII->get(RISCV::OR), ScratchReg)
+ .addReg(DestReg)
+ .addReg(IncrReg);
+ break;
+ case AtomicRMWInst::Xor:
+ BuildMI(LoopMBB, DL, TII->get(RISCV::XOR), ScratchReg)
+ .addReg(DestReg)
+ .addReg(IncrReg);
+ break;
case AtomicRMWInst::Nand:
BuildMI(LoopMBB, DL, TII->get(RISCV::AND), ScratchReg)
.addReg(DestReg)
@@ -433,38 +521,85 @@ static void insertSext(const RISCVInstrInfo *TII, DebugLoc DL,
.addReg(ShamtReg);
}
-bool RISCVExpandAtomicPseudo::expandAtomicMinMaxOp(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- AtomicRMWInst::BinOp BinOp, bool IsMasked, int Width,
- MachineBasicBlock::iterator &NextMBBI) {
- assert(IsMasked == true &&
- "Should only need to expand masked atomic max/min");
- assert(Width == 32 && "Should never need to expand masked 64-bit operations");
+static void doAtomicMinMaxOpExpansion(
+ const RISCVInstrInfo *TII, MachineInstr &MI, DebugLoc DL,
+ MachineBasicBlock *ThisMBB, MachineBasicBlock *LoopHeadMBB,
+ MachineBasicBlock *LoopIfBodyMBB, MachineBasicBlock *LoopTailMBB,
+ MachineBasicBlock *DoneMBB, AtomicRMWInst::BinOp BinOp, int Width,
+ const RISCVSubtarget *STI) {
+ Register DestReg = MI.getOperand(0).getReg();
+ Register ScratchReg = MI.getOperand(1).getReg();
+ Register AddrReg = MI.getOperand(2).getReg();
+ Register IncrReg = MI.getOperand(3).getReg();
+ AtomicOrdering Ordering =
+ static_cast<AtomicOrdering>(MI.getOperand(4).getImm());
- MachineInstr &MI = *MBBI;
- DebugLoc DL = MI.getDebugLoc();
- MachineFunction *MF = MBB.getParent();
- auto LoopHeadMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
- auto LoopIfBodyMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
- auto LoopTailMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
- auto DoneMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+ // .loophead:
+ // lr.[w|d] dest, (addr)
+ // mv scratch, dest
+ // ifnochangeneeded scratch, incr, .looptail
+ BuildMI(LoopHeadMBB, DL, TII->get(getLRForRMW(Ordering, Width, STI)), DestReg)
+ .addReg(AddrReg);
+ BuildMI(LoopHeadMBB, DL, TII->get(RISCV::ADDI), ScratchReg)
+ .addReg(DestReg)
+ .addImm(0);
+ switch (BinOp) {
+ default:
+ llvm_unreachable("Unexpected AtomicRMW BinOp");
+ case AtomicRMWInst::Max: {
+ BuildMI(LoopHeadMBB, DL, TII->get(RISCV::BGE))
+ .addReg(ScratchReg)
+ .addReg(IncrReg)
+ .addMBB(LoopTailMBB);
+ break;
+ }
+ case AtomicRMWInst::Min: {
+ BuildMI(LoopHeadMBB, DL, TII->get(RISCV::BGE))
+ .addReg(IncrReg)
+ .addReg(ScratchReg)
+ .addMBB(LoopTailMBB);
+ break;
+ }
+ case AtomicRMWInst::UMax:
+ BuildMI(LoopHeadMBB, DL, TII->get(RISCV::BGEU))
+ .addReg(ScratchReg)
+ .addReg(IncrReg)
+ .addMBB(LoopTailMBB);
+ break;
+ case AtomicRMWInst::UMin:
+ BuildMI(LoopHeadMBB, DL, TII->get(RISCV::BGEU))
+ .addReg(IncrReg)
+ .addReg(ScratchReg)
+ .addMBB(LoopTailMBB);
+ break;
+ }
- // Insert new MBBs.
- MF->insert(++MBB.getIterator(), LoopHeadMBB);
- MF->insert(++LoopHeadMBB->getIterator(), LoopIfBodyMBB);
- MF->insert(++LoopIfBodyMBB->getIterator(), LoopTailMBB);
- MF->insert(++LoopTailMBB->getIterator(), DoneMBB);
+ // .loopifbody:
+ // mv scratch, incr
+ BuildMI(LoopIfBodyMBB, DL, TII->get(RISCV::ADDI), ScratchReg)
+ .addReg(IncrReg)
+ .addImm(0);
- // Set up successors and transfer remaining instructions to DoneMBB.
- LoopHeadMBB->addSuccessor(LoopIfBodyMBB);
- LoopHeadMBB->addSuccessor(LoopTailMBB);
- LoopIfBodyMBB->addSuccessor(LoopTailMBB);
- LoopTailMBB->addSuccessor(LoopHeadMBB);
- LoopTailMBB->addSuccessor(DoneMBB);
- DoneMBB->splice(DoneMBB->end(), &MBB, MI, MBB.end());
- DoneMBB->transferSuccessors(&MBB);
- MBB.addSuccessor(LoopHeadMBB);
+ // .looptail:
+ // sc.[w|d] scratch, scratch, (addr)
+ // bnez scratch, loop
+ BuildMI(LoopTailMBB, DL, TII->get(getSCForRMW(Ordering, Width, STI)),
+ ScratchReg)
+ .addReg(ScratchReg)
+ .addReg(AddrReg);
+ BuildMI(LoopTailMBB, DL, TII->get(RISCV::BNE))
+ .addReg(ScratchReg)
+ .addReg(RISCV::X0)
+ .addMBB(LoopHeadMBB);
+}
+static void doMaskedAtomicMinMaxOpExpansion(
+ const RISCVInstrInfo *TII, MachineInstr &MI, DebugLoc DL,
+ MachineBasicBlock *ThisMBB, MachineBasicBlock *LoopHeadMBB,
+ MachineBasicBlock *LoopIfBodyMBB, MachineBasicBlock *LoopTailMBB,
+ MachineBasicBlock *DoneMBB, AtomicRMWInst::BinOp BinOp, int Width,
+ const RISCVSubtarget *STI) {
+ assert(Width == 32 && "Should never need to expand masked 64-bit operations");
Register DestReg = MI.getOperand(0).getReg();
Register Scratch1Reg = MI.getOperand(1).getReg();
Register Scratch2Reg = MI.getOperand(2).getReg();
@@ -541,6 +676,44 @@ bool RISCVExpandAtomicPseudo::expandAtomicMinMaxOp(
.addReg(Scratch1Reg)
.addReg(RISCV::X0)
.addMBB(LoopHeadMBB);
+}
+
+bool RISCVExpandAtomicPseudo::expandAtomicMinMaxOp(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ AtomicRMWInst::BinOp BinOp, bool IsMasked, int Width,
+ MachineBasicBlock::iterator &NextMBBI) {
+
+ MachineInstr &MI = *MBBI;
+ DebugLoc DL = MI.getDebugLoc();
+ MachineFunction *MF = MBB.getParent();
+ auto LoopHeadMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+ auto LoopIfBodyMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+ auto LoopTailMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+ auto DoneMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+
+ // Insert new MBBs.
+ MF->insert(++MBB.getIterator(), LoopHeadMBB);
+ MF->insert(++LoopHeadMBB->getIterator(), LoopIfBodyMBB);
+ MF->insert(++LoopIfBodyMBB->getIterator(), LoopTailMBB);
+ MF->insert(++LoopTailMBB->getIterator(), DoneMBB);
+
+ // Set up successors and transfer remaining instructions to DoneMBB.
+ LoopHeadMBB->addSuccessor(LoopIfBodyMBB);
+ LoopHeadMBB->addSuccessor(LoopTailMBB);
+ LoopIfBodyMBB->addSuccessor(LoopTailMBB);
+ LoopTailMBB->addSuccessor(LoopHeadMBB);
+ LoopTailMBB->addSuccessor(DoneMBB);
+ DoneMBB->splice(DoneMBB->end(), &MBB, MI, MBB.end());
+ DoneMBB->transferSuccessors(&MBB);
+ MBB.addSuccessor(LoopHeadMBB);
+
+ if (!IsMasked)
+ doAtomicMinMaxOpExpansion(TII, MI, DL, &MBB, LoopHeadMBB, LoopIfBodyMBB,
+ LoopTailMBB, DoneMBB, BinOp, Width, STI);
+ else
+ doMaskedAtomicMinMaxOpExpansion(TII, MI, DL, &MBB, LoopHeadMBB,
+ LoopIfBodyMBB, LoopTailMBB, DoneMBB, BinOp,
+ Width, STI);
NextMBBI = MBB.end();
MI.eraseFromParent();
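
With these expansions in place, a Zalrsc-only target (no Zaamo) can lower plain atomicrmw operations through the new pseudos; a minimal example that should go through PseudoAtomicLoadMax32 and the lr.w/sc.w loop sketched in the comments above:

; Expanded to an LR/SC compare-and-branch loop when only Zalrsc is available.
define i32 @rmw_max(ptr %p, i32 %v) {
  %old = atomicrmw max ptr %p, i32 %v seq_cst
  ret i32 %old
}
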
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 19992e6..9e6b7f0 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -218,6 +218,7 @@ def HasStdExtZaamo
: Predicate<"Subtarget->hasStdExtZaamo()">,
AssemblerPredicate<(any_of FeatureStdExtZaamo),
"'Zaamo' (Atomic Memory Operations)">;
+def NoStdExtZaamo : Predicate<"!Subtarget->hasStdExtZaamo()">;
def FeatureStdExtZalrsc
: RISCVExtension<1, 0, "Load-Reserved/Store-Conditional">;
@@ -1334,6 +1335,44 @@ def HasVendorXSfvfnrclipxfqf
AssemblerPredicate<(all_of FeatureVendorXSfvfnrclipxfqf),
"'XSfvfnrclipxfqf' (SiFive FP32-to-int8 Ranged Clip Instructions)">;
+// Note: XSfvfbfexp16e depends on either Zvfbfmin _or_ Zvfbfa, which cannot be expressed here in
+// TableGen. Instead, we check that in RISCVISAInfo.
+def FeatureVendorXSfvfbfexp16e
+ : RISCVExtension<0, 5,
+ "SiFive Vector Floating-Point Exponential Function Instruction, BFloat16">;
+def HasVendorXSfvfbfexp16e : Predicate<"Subtarget->hasVendorXSfvfbfexp16e()">;
+
+def FeatureVendorXSfvfexp16e
+ : RISCVExtension<0, 5,
+ "SiFive Vector Floating-Point Exponential Function Instruction, Half Precision",
+ [FeatureStdExtZvfh]>;
+def HasVendorXSfvfexp16e : Predicate<"Subtarget->hasVendorXSfvfexp16e()">;
+
+def FeatureVendorXSfvfexp32e
+ : RISCVExtension<0, 5,
+ "SiFive Vector Floating-Point Exponential Function Instruction, Single Precision",
+ [FeatureStdExtZve32f]>;
+def HasVendorXSfvfexp32e : Predicate<"Subtarget->hasVendorXSfvfexp32e()">;
+
+def HasVendorXSfvfexpAnyFloat : Predicate<"Subtarget->hasVendorXSfvfexp16e() || Subtarget->hasVendorXSfvfexp32e()">;
+def HasVendorXSfvfexpAny : Predicate<"Subtarget->hasVendorXSfvfbfexp16e() || Subtarget->hasVendorXSfvfexp16e() || Subtarget->hasVendorXSfvfexp32e()">,
+ AssemblerPredicate<(any_of FeatureVendorXSfvfbfexp16e, FeatureVendorXSfvfexp16e, FeatureVendorXSfvfexp32e),
+ "'Xsfvfbfexp16e', 'Xsfvfexp16e', or 'Xsfvfexp32e' (SiFive Vector Floating-Point Exponential Function Instruction)">;
+
+def FeatureVendorXSfvfexpa
+ : RISCVExtension<0, 2,
+ "SiFive Vector Floating-Point Exponential Approximation Instruction",
+ [FeatureStdExtZve32f]>;
+def HasVendorXSfvfexpa : Predicate<"Subtarget->hasVendorXSfvfexpa()">,
+ AssemblerPredicate<(all_of FeatureVendorXSfvfexpa),
+ "'Xsfvfexpa' (SiFive Vector Floating-Point Exponential Approximation Instruction)">;
+
+def FeatureVendorXSfvfexpa64e
+ : RISCVExtension<0, 2,
+ "SiFive Vector Floating-Point Exponential Approximation Instruction with Double-Precision",
+ [FeatureVendorXSfvfexpa, FeatureStdExtZve64d]>;
+def HasVendorXSfvfexpa64e : Predicate<"Subtarget->hasVendorXSfvfexpa64e()">;
+
def FeatureVendorXSiFivecdiscarddlone
: RISCVExtension<1, 0,
"SiFive sf.cdiscard.d.l1 Instruction", []>;
@@ -1864,7 +1903,7 @@ def FeatureForcedAtomics : SubtargetFeature<
"forced-atomics", "HasForcedAtomics", "true",
"Assume that lock-free native-width atomics are available">;
def HasAtomicLdSt
- : Predicate<"Subtarget->hasStdExtA() || Subtarget->hasForcedAtomics()">;
+ : Predicate<"Subtarget->hasStdExtZalrsc() || Subtarget->hasForcedAtomics()">;
def FeatureTaggedGlobals : SubtargetFeature<"tagged-globals",
"AllowTaggedGlobals",
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index a77d765..26fe9ed 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -688,7 +688,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
else if (Subtarget.hasStdExtZicbop())
setOperationAction(ISD::PREFETCH, MVT::Other, Legal);
- if (Subtarget.hasStdExtA()) {
+ if (Subtarget.hasStdExtZalrsc()) {
setMaxAtomicSizeInBitsSupported(Subtarget.getXLen());
if (Subtarget.hasStdExtZabha() && Subtarget.hasStdExtZacas())
setMinCmpXchgSizeInBits(8);
@@ -1558,7 +1558,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
}
}
- if (Subtarget.hasStdExtA())
+ if (Subtarget.hasStdExtZaamo())
setOperationAction(ISD::ATOMIC_LOAD_SUB, XLenVT, Expand);
if (Subtarget.hasForcedAtomics()) {
@@ -21875,7 +21875,7 @@ unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode(
// result is then sign extended to XLEN. With +A, the minimum width is
// 32 for both 64 and 32.
assert(getMinCmpXchgSizeInBits() == 32);
- assert(Subtarget.hasStdExtA());
+ assert(Subtarget.hasStdExtZalrsc());
return Op.getValueSizeInBits() - 31;
}
break;
@@ -24471,6 +24471,25 @@ ISD::NodeType RISCVTargetLowering::getExtendForAtomicCmpSwapArg() const {
return Subtarget.hasStdExtZacas() ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
}
+ISD::NodeType RISCVTargetLowering::getExtendForAtomicRMWArg(unsigned Op) const {
+ // Zaamo will use amo<op>.w which does not require extension.
+ if (Subtarget.hasStdExtZaamo() || Subtarget.hasForcedAtomics())
+ return ISD::ANY_EXTEND;
+
+ // Zalrsc pseudo expansions with comparison require sign-extension.
+ assert(Subtarget.hasStdExtZalrsc());
+ switch (Op) {
+ case ISD::ATOMIC_LOAD_MIN:
+ case ISD::ATOMIC_LOAD_MAX:
+ case ISD::ATOMIC_LOAD_UMIN:
+ case ISD::ATOMIC_LOAD_UMAX:
+ return ISD::SIGN_EXTEND;
+ default:
+ break;
+ }
+ return ISD::ANY_EXTEND;
+}
+
Register RISCVTargetLowering::getExceptionPointerRegister(
const Constant *PersonalityFn) const {
return RISCV::X10;
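A self-contained illustration of why the comparison-based Zalrsc expansions request SIGN_EXTEND while Zaamo can use ANY_EXTEND: on RV64 the i32 operand occupies a full 64-bit register, and the branch in the expanded loop compares XLEN-wide values. The values and names below are made up for the example; this is not LLVM code.

#include <cstdint>
#include <iostream>

int main() {
  int32_t Stored = -1;                                  // i32 value in memory
  int64_t ZeroExtended = 0x00000000FFFFFFFFLL;          // -1 zero-extended to 64 bits
  int64_t SignExtended = static_cast<int64_t>(Stored);  // -1 sign-extended to 64 bits
  int64_t Incr = 5;
  // A 64-bit signed compare, which is what the BLT/BGE in the expanded loop performs:
  std::cout << (ZeroExtended < Incr) << "\n"; // 0: wrong answer for signed i32 min/max
  std::cout << (SignExtended < Incr) << "\n"; // 1: matches the i32 semantics
  return 0;
}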
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 3f81ed7..9e3e2a9 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -245,6 +245,7 @@ public:
}
ISD::NodeType getExtendForAtomicCmpSwapArg() const override;
+ ISD::NodeType getExtendForAtomicRMWArg(unsigned Op) const override;
bool shouldTransformSignedTruncationCheck(EVT XVT,
unsigned KeptBits) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 12f776b..912b82d 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -1689,42 +1689,44 @@ bool RISCVInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
// instruction opcode. Otherwise, return RISCV::INSTRUCTION_LIST_END.
// TODO: Support more operations.
unsigned getPredicatedOpcode(unsigned Opcode) {
+ // clang-format off
switch (Opcode) {
- case RISCV::ADD: return RISCV::PseudoCCADD; break;
- case RISCV::SUB: return RISCV::PseudoCCSUB; break;
- case RISCV::SLL: return RISCV::PseudoCCSLL; break;
- case RISCV::SRL: return RISCV::PseudoCCSRL; break;
- case RISCV::SRA: return RISCV::PseudoCCSRA; break;
- case RISCV::AND: return RISCV::PseudoCCAND; break;
- case RISCV::OR: return RISCV::PseudoCCOR; break;
- case RISCV::XOR: return RISCV::PseudoCCXOR; break;
-
- case RISCV::ADDI: return RISCV::PseudoCCADDI; break;
- case RISCV::SLLI: return RISCV::PseudoCCSLLI; break;
- case RISCV::SRLI: return RISCV::PseudoCCSRLI; break;
- case RISCV::SRAI: return RISCV::PseudoCCSRAI; break;
- case RISCV::ANDI: return RISCV::PseudoCCANDI; break;
- case RISCV::ORI: return RISCV::PseudoCCORI; break;
- case RISCV::XORI: return RISCV::PseudoCCXORI; break;
-
- case RISCV::ADDW: return RISCV::PseudoCCADDW; break;
- case RISCV::SUBW: return RISCV::PseudoCCSUBW; break;
- case RISCV::SLLW: return RISCV::PseudoCCSLLW; break;
- case RISCV::SRLW: return RISCV::PseudoCCSRLW; break;
- case RISCV::SRAW: return RISCV::PseudoCCSRAW; break;
-
- case RISCV::ADDIW: return RISCV::PseudoCCADDIW; break;
- case RISCV::SLLIW: return RISCV::PseudoCCSLLIW; break;
- case RISCV::SRLIW: return RISCV::PseudoCCSRLIW; break;
- case RISCV::SRAIW: return RISCV::PseudoCCSRAIW; break;
-
- case RISCV::ANDN: return RISCV::PseudoCCANDN; break;
- case RISCV::ORN: return RISCV::PseudoCCORN; break;
- case RISCV::XNOR: return RISCV::PseudoCCXNOR; break;
-
- case RISCV::NDS_BFOS: return RISCV::PseudoCCNDS_BFOS; break;
- case RISCV::NDS_BFOZ: return RISCV::PseudoCCNDS_BFOZ; break;
+ case RISCV::ADD: return RISCV::PseudoCCADD;
+ case RISCV::SUB: return RISCV::PseudoCCSUB;
+ case RISCV::SLL: return RISCV::PseudoCCSLL;
+ case RISCV::SRL: return RISCV::PseudoCCSRL;
+ case RISCV::SRA: return RISCV::PseudoCCSRA;
+ case RISCV::AND: return RISCV::PseudoCCAND;
+ case RISCV::OR: return RISCV::PseudoCCOR;
+ case RISCV::XOR: return RISCV::PseudoCCXOR;
+
+ case RISCV::ADDI: return RISCV::PseudoCCADDI;
+ case RISCV::SLLI: return RISCV::PseudoCCSLLI;
+ case RISCV::SRLI: return RISCV::PseudoCCSRLI;
+ case RISCV::SRAI: return RISCV::PseudoCCSRAI;
+ case RISCV::ANDI: return RISCV::PseudoCCANDI;
+ case RISCV::ORI: return RISCV::PseudoCCORI;
+ case RISCV::XORI: return RISCV::PseudoCCXORI;
+
+ case RISCV::ADDW: return RISCV::PseudoCCADDW;
+ case RISCV::SUBW: return RISCV::PseudoCCSUBW;
+ case RISCV::SLLW: return RISCV::PseudoCCSLLW;
+ case RISCV::SRLW: return RISCV::PseudoCCSRLW;
+ case RISCV::SRAW: return RISCV::PseudoCCSRAW;
+
+ case RISCV::ADDIW: return RISCV::PseudoCCADDIW;
+ case RISCV::SLLIW: return RISCV::PseudoCCSLLIW;
+ case RISCV::SRLIW: return RISCV::PseudoCCSRLIW;
+ case RISCV::SRAIW: return RISCV::PseudoCCSRAIW;
+
+ case RISCV::ANDN: return RISCV::PseudoCCANDN;
+ case RISCV::ORN: return RISCV::PseudoCCORN;
+ case RISCV::XNOR: return RISCV::PseudoCCXNOR;
+
+ case RISCV::NDS_BFOS: return RISCV::PseudoCCNDS_BFOS;
+ case RISCV::NDS_BFOZ: return RISCV::PseudoCCNDS_BFOZ;
}
+ // clang-format on
return RISCV::INSTRUCTION_LIST_END;
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td
index 571d72f..5c81a09 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td
@@ -158,9 +158,9 @@ class seq_cst_store<PatFrag base>
}
} // IsAtomic = 1
-// Atomic load/store are available under both +a and +force-atomics.
-// Fences will be inserted for atomic load/stores according to the logic in
-// RISCVTargetLowering::{emitLeadingFence,emitTrailingFence}.
+// Atomic load/store are available under +zalrsc (thus also +a) and
+// +force-atomics. Fences will be inserted for atomic load/stores according to
+// the logic in RISCVTargetLowering::{emitLeadingFence,emitTrailingFence}.
// The normal loads/stores are relaxed (unordered) loads/stores that don't have
// any ordering. This is necessary because AtomicExpandPass has added fences to
// atomic load/stores and changed them to unordered ones.
@@ -308,7 +308,65 @@ class PseudoMaskedAMOMinMaxPat<Intrinsic intrin, Pseudo AMOInst>
(AMOInst GPR:$addr, GPR:$incr, GPR:$mask, GPR:$shiftamt,
timm:$ordering)>;
-let Predicates = [HasStdExtA] in {
+let Predicates = [HasStdExtZalrsc, NoStdExtZaamo] in {
+
+let Size = 16 in {
+def PseudoAtomicSwap32 : PseudoAMO;
+def PseudoAtomicLoadAdd32 : PseudoAMO;
+def PseudoAtomicLoadSub32 : PseudoAMO;
+def PseudoAtomicLoadAnd32 : PseudoAMO;
+def PseudoAtomicLoadOr32 : PseudoAMO;
+def PseudoAtomicLoadXor32 : PseudoAMO;
+} // Size = 16
+let Size = 24 in {
+def PseudoAtomicLoadMax32 : PseudoAMO;
+def PseudoAtomicLoadMin32 : PseudoAMO;
+def PseudoAtomicLoadUMax32 : PseudoAMO;
+def PseudoAtomicLoadUMin32 : PseudoAMO;
+} // Size = 24
+
+defm : PseudoAMOPat<"atomic_swap_i32", PseudoAtomicSwap32>;
+defm : PseudoAMOPat<"atomic_load_add_i32", PseudoAtomicLoadAdd32>;
+defm : PseudoAMOPat<"atomic_load_sub_i32", PseudoAtomicLoadSub32>;
+defm : PseudoAMOPat<"atomic_load_and_i32", PseudoAtomicLoadAnd32>;
+defm : PseudoAMOPat<"atomic_load_or_i32", PseudoAtomicLoadOr32>;
+defm : PseudoAMOPat<"atomic_load_xor_i32", PseudoAtomicLoadXor32>;
+defm : PseudoAMOPat<"atomic_load_max_i32", PseudoAtomicLoadMax32>;
+defm : PseudoAMOPat<"atomic_load_min_i32", PseudoAtomicLoadMin32>;
+defm : PseudoAMOPat<"atomic_load_umax_i32", PseudoAtomicLoadUMax32>;
+defm : PseudoAMOPat<"atomic_load_umin_i32", PseudoAtomicLoadUMin32>;
+} // Predicates = [HasStdExtZalrsc, NoStdExtZaamo]
+
+let Predicates = [HasStdExtZalrsc, NoStdExtZaamo, IsRV64] in {
+
+let Size = 16 in {
+def PseudoAtomicSwap64 : PseudoAMO;
+def PseudoAtomicLoadAdd64 : PseudoAMO;
+def PseudoAtomicLoadSub64 : PseudoAMO;
+def PseudoAtomicLoadAnd64 : PseudoAMO;
+def PseudoAtomicLoadOr64 : PseudoAMO;
+def PseudoAtomicLoadXor64 : PseudoAMO;
+} // Size = 16
+let Size = 24 in {
+def PseudoAtomicLoadMax64 : PseudoAMO;
+def PseudoAtomicLoadMin64 : PseudoAMO;
+def PseudoAtomicLoadUMax64 : PseudoAMO;
+def PseudoAtomicLoadUMin64 : PseudoAMO;
+} // Size = 24
+
+defm : PseudoAMOPat<"atomic_swap_i64", PseudoAtomicSwap64, i64>;
+defm : PseudoAMOPat<"atomic_load_add_i64", PseudoAtomicLoadAdd64, i64>;
+defm : PseudoAMOPat<"atomic_load_sub_i64", PseudoAtomicLoadSub64, i64>;
+defm : PseudoAMOPat<"atomic_load_and_i64", PseudoAtomicLoadAnd64, i64>;
+defm : PseudoAMOPat<"atomic_load_or_i64", PseudoAtomicLoadOr64, i64>;
+defm : PseudoAMOPat<"atomic_load_xor_i64", PseudoAtomicLoadXor64, i64>;
+defm : PseudoAMOPat<"atomic_load_max_i64", PseudoAtomicLoadMax64, i64>;
+defm : PseudoAMOPat<"atomic_load_min_i64", PseudoAtomicLoadMin64, i64>;
+defm : PseudoAMOPat<"atomic_load_umax_i64", PseudoAtomicLoadUMax64, i64>;
+defm : PseudoAMOPat<"atomic_load_umin_i64", PseudoAtomicLoadUMin64, i64>;
+} // Predicates = [HasStdExtZalrsc, NoStdExtZaamo, IsRV64]
+
+let Predicates = [HasStdExtZalrsc] in {
let Size = 20 in
def PseudoAtomicLoadNand32 : PseudoAMO;
@@ -347,14 +405,14 @@ def : PseudoMaskedAMOPat<int_riscv_masked_atomicrmw_umax,
PseudoMaskedAtomicLoadUMax32>;
def : PseudoMaskedAMOPat<int_riscv_masked_atomicrmw_umin,
PseudoMaskedAtomicLoadUMin32>;
-} // Predicates = [HasStdExtA]
+} // Predicates = [HasStdExtZalrsc]
-let Predicates = [HasStdExtA, IsRV64] in {
+let Predicates = [HasStdExtZalrsc, IsRV64] in {
let Size = 20 in
def PseudoAtomicLoadNand64 : PseudoAMO;
defm : PseudoAMOPat<"atomic_load_nand_i64", PseudoAtomicLoadNand64, i64>;
-} // Predicates = [HasStdExtA, IsRV64]
+} // Predicates = [HasStdExtZalrsc, IsRV64]
/// Compare and exchange
@@ -385,17 +443,17 @@ multiclass PseudoCmpXchgPat<string Op, Pseudo CmpXchgInst,
(CmpXchgInst GPR:$addr, GPR:$cmp, GPR:$new, 7)>;
}
-let Predicates = [HasStdExtA, NoStdExtZacas] in {
+let Predicates = [HasStdExtZalrsc, NoStdExtZacas] in {
def PseudoCmpXchg32 : PseudoCmpXchg;
defm : PseudoCmpXchgPat<"atomic_cmp_swap_i32", PseudoCmpXchg32>;
}
-let Predicates = [HasStdExtA, NoStdExtZacas, IsRV64] in {
+let Predicates = [HasStdExtZalrsc, NoStdExtZacas, IsRV64] in {
def PseudoCmpXchg64 : PseudoCmpXchg;
defm : PseudoCmpXchgPat<"atomic_cmp_swap_i64", PseudoCmpXchg64, i64>;
}
-let Predicates = [HasStdExtA] in {
+let Predicates = [HasStdExtZalrsc] in {
def PseudoMaskedCmpXchg32
: Pseudo<(outs GPR:$res, GPR:$scratch),
(ins GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask,
@@ -412,4 +470,4 @@ def : Pat<(XLenVT (int_riscv_masked_cmpxchg
(XLenVT GPR:$mask), (XLenVT timm:$ordering))),
(PseudoMaskedCmpXchg32
GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, timm:$ordering)>;
-} // Predicates = [HasStdExtA]
+} // Predicates = [HasStdExtZalrsc]
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
index 6a4119a..4104abd 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
@@ -217,6 +217,14 @@ let Predicates = [HasVendorXSfvcp], mayLoad = 0, mayStore = 0,
defm FVW : CustomSiFiveVCIX<"fvw", VCIX_XVW, VR, VR, FPR32>, Sched<[]>;
}
+let Predicates = [HasVendorXSfvfexpAny], DecoderNamespace = "XSfvector" in {
+ def SF_VFEXP_V : VALUVs2<0b010011, 0b00111, OPFVV, "sf.vfexp.v">;
+}
+
+let Predicates = [HasVendorXSfvfexpa], DecoderNamespace = "XSfvector" in {
+ def SF_VFEXPA_V : VALUVs2<0b010011, 0b00110, OPFVV, "sf.vfexpa.v">;
+}
+
let Predicates = [HasVendorXSfvqmaccdod], DecoderNamespace = "XSfvector",
DestEEW = EEWSEWx4, RVVConstraint=VS2Constraint in {
def SF_VQMACCU_2x8x2 : CustomSiFiveVMACC<0b101100, OPMVV, "sf.vqmaccu.2x8x2">;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td
index f7d1a09..b9c5b75 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td
@@ -668,4 +668,38 @@ foreach vti = NoGroupBF16Vectors in {
def : Pat<(vti.Scalar (extractelt (vti.Vector vti.RegClass:$rs2), 0)),
(vfmv_f_s_inst vti.RegClass:$rs2, vti.Log2SEW)>;
}
+
+let Predicates = [HasStdExtZvfbfa] in {
+ foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in {
+ defvar fvti = fvtiToFWti.Vti;
+ defvar fwti = fvtiToFWti.Wti;
+ def : Pat<(fwti.Vector (any_riscv_fpextend_vl
+ (fvti.Vector fvti.RegClass:$rs1),
+ (fvti.Mask VMV0:$vm),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVFWCVT_F_F_ALT_V_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK")
+ (fwti.Vector (IMPLICIT_DEF)), fvti.RegClass:$rs1,
+ (fvti.Mask VMV0:$vm),
+ GPR:$vl, fvti.Log2SEW, TA_MA)>;
+
+ def : Pat<(fvti.Vector (any_riscv_fpround_vl
+ (fwti.Vector fwti.RegClass:$rs1),
+ (fwti.Mask VMV0:$vm), VLOpFrag)),
+ (!cast<Instruction>("PseudoVFNCVT_F_F_ALT_W_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK")
+ (fvti.Vector (IMPLICIT_DEF)), fwti.RegClass:$rs1,
+ (fwti.Mask VMV0:$vm),
+ // Value to indicate no rounding mode change in
+ // RISCVInsertReadWriteCSR
+ FRM_DYN,
+ GPR:$vl, fvti.Log2SEW, TA_MA)>;
+ def : Pat<(fvti.Vector (fpround (fwti.Vector fwti.RegClass:$rs1))),
+ (!cast<Instruction>("PseudoVFNCVT_F_F_ALT_W_"#fvti.LMul.MX#"_E"#fvti.SEW)
+ (fvti.Vector (IMPLICIT_DEF)),
+ fwti.RegClass:$rs1,
+ // Value to indicate no rounding mode change in
+ // RISCVInsertReadWriteCSR
+ FRM_DYN,
+ fvti.AVL, fvti.Log2SEW, TA_MA)>;
+ }
+}
} // Predicates = [HasStdExtZvfbfa]
diff --git a/llvm/lib/Target/SPIRV/SPIRVCBufferAccess.cpp b/llvm/lib/Target/SPIRV/SPIRVCBufferAccess.cpp
index f7fb886..3ca0b40 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCBufferAccess.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCBufferAccess.cpp
@@ -35,6 +35,7 @@
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsSPIRV.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/ReplaceConstant.h"
#define DEBUG_TYPE "spirv-cbuffer-access"
using namespace llvm;
@@ -57,6 +58,12 @@ static bool replaceCBufferAccesses(Module &M) {
if (!CBufMD)
return false;
+ SmallVector<Constant *> CBufferGlobals;
+ for (const hlsl::CBufferMapping &Mapping : *CBufMD)
+ for (const hlsl::CBufferMember &Member : Mapping.Members)
+ CBufferGlobals.push_back(Member.GV);
+ convertUsersOfConstantsToInstructions(CBufferGlobals);
+
for (const hlsl::CBufferMapping &Mapping : *CBufMD) {
Instruction *HandleDef = findHandleDef(Mapping.Handle);
if (!HandleDef) {
@@ -80,12 +87,7 @@ static bool replaceCBufferAccesses(Module &M) {
Value *GetPointerCall = Builder.CreateIntrinsic(
PtrType, Intrinsic::spv_resource_getpointer, {HandleDef, IndexVal});
- // We cannot use replaceAllUsesWith here because some uses may be
- // ConstantExprs, which cannot be replaced with non-constants.
- SmallVector<User *, 4> Users(MemberGV->users());
- for (User *U : Users) {
- U->replaceUsesOfWith(MemberGV, GetPointerCall);
- }
+ MemberGV->replaceAllUsesWith(GetPointerCall);
}
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 5591d9f..021353a 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -355,9 +355,9 @@ private:
SPIRVType *widenTypeToVec4(const SPIRVType *Type, MachineInstr &I) const;
bool extractSubvector(Register &ResVReg, const SPIRVType *ResType,
Register &ReadReg, MachineInstr &InsertionPoint) const;
- bool generateImageRead(Register &ResVReg, const SPIRVType *ResType,
- Register ImageReg, Register IdxReg, DebugLoc Loc,
- MachineInstr &Pos) const;
+ bool generateImageReadOrFetch(Register &ResVReg, const SPIRVType *ResType,
+ Register ImageReg, Register IdxReg,
+ DebugLoc Loc, MachineInstr &Pos) const;
bool BuildCOPY(Register DestReg, Register SrcReg, MachineInstr &I) const;
bool loadVec3BuiltinInputID(SPIRV::BuiltIn::BuiltIn BuiltInValue,
Register ResVReg, const SPIRVType *ResType,
@@ -1321,8 +1321,8 @@ bool SPIRVInstructionSelector::selectLoad(Register ResVReg,
}
Register IdxReg = IntPtrDef->getOperand(3).getReg();
- return generateImageRead(ResVReg, ResType, NewHandleReg, IdxReg,
- I.getDebugLoc(), I);
+ return generateImageReadOrFetch(ResVReg, ResType, NewHandleReg, IdxReg,
+ I.getDebugLoc(), I);
}
}
@@ -3639,27 +3639,33 @@ bool SPIRVInstructionSelector::selectReadImageIntrinsic(
DebugLoc Loc = I.getDebugLoc();
MachineInstr &Pos = I;
- return generateImageRead(ResVReg, ResType, NewImageReg, IdxReg, Loc, Pos);
+ return generateImageReadOrFetch(ResVReg, ResType, NewImageReg, IdxReg, Loc,
+ Pos);
}
-bool SPIRVInstructionSelector::generateImageRead(Register &ResVReg,
- const SPIRVType *ResType,
- Register ImageReg,
- Register IdxReg, DebugLoc Loc,
- MachineInstr &Pos) const {
+bool SPIRVInstructionSelector::generateImageReadOrFetch(
+ Register &ResVReg, const SPIRVType *ResType, Register ImageReg,
+ Register IdxReg, DebugLoc Loc, MachineInstr &Pos) const {
SPIRVType *ImageType = GR.getSPIRVTypeForVReg(ImageReg);
assert(ImageType && ImageType->getOpcode() == SPIRV::OpTypeImage &&
"ImageReg is not an image type.");
+
bool IsSignedInteger =
sampledTypeIsSignedInteger(GR.getTypeForSPIRVType(ImageType));
+ // Check if the "sampled" operand of the image type is 1.
+ // https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#OpImageFetch
+ auto SampledOp = ImageType->getOperand(6);
+ bool IsFetch = (SampledOp.getImm() == 1);
uint64_t ResultSize = GR.getScalarOrVectorComponentCount(ResType);
if (ResultSize == 4) {
- auto BMI = BuildMI(*Pos.getParent(), Pos, Loc, TII.get(SPIRV::OpImageRead))
- .addDef(ResVReg)
- .addUse(GR.getSPIRVTypeID(ResType))
- .addUse(ImageReg)
- .addUse(IdxReg);
+ auto BMI =
+ BuildMI(*Pos.getParent(), Pos, Loc,
+ TII.get(IsFetch ? SPIRV::OpImageFetch : SPIRV::OpImageRead))
+ .addDef(ResVReg)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addUse(ImageReg)
+ .addUse(IdxReg);
if (IsSignedInteger)
BMI.addImm(0x1000); // SignExtend
@@ -3668,11 +3674,13 @@ bool SPIRVInstructionSelector::generateImageRead(Register &ResVReg,
SPIRVType *ReadType = widenTypeToVec4(ResType, Pos);
Register ReadReg = MRI->createVirtualRegister(GR.getRegClass(ReadType));
- auto BMI = BuildMI(*Pos.getParent(), Pos, Loc, TII.get(SPIRV::OpImageRead))
- .addDef(ReadReg)
- .addUse(GR.getSPIRVTypeID(ReadType))
- .addUse(ImageReg)
- .addUse(IdxReg);
+ auto BMI =
+ BuildMI(*Pos.getParent(), Pos, Loc,
+ TII.get(IsFetch ? SPIRV::OpImageFetch : SPIRV::OpImageRead))
+ .addDef(ReadReg)
+ .addUse(GR.getSPIRVTypeID(ReadType))
+ .addUse(ImageReg)
+ .addUse(IdxReg);
if (IsSignedInteger)
BMI.addImm(0x1000); // SignExtend
bool Succeed = BMI.constrainAllUses(TII, TRI, RBI);
diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp
index cf85691..9bda8a4 100644
--- a/llvm/lib/Target/TargetMachine.cpp
+++ b/llvm/lib/Target/TargetMachine.cpp
@@ -158,7 +158,6 @@ void TargetMachine::resetTargetOptions(const Function &F) const {
Options.X = F.getFnAttribute(Y).getValueAsBool(); \
} while (0)
- RESET_OPTION(UnsafeFPMath, "unsafe-fp-math");
RESET_OPTION(NoInfsFPMath, "no-infs-fp-math");
RESET_OPTION(NoNaNsFPMath, "no-nans-fp-math");
RESET_OPTION(NoSignedZerosFPMath, "no-signed-zeros-fp-math");
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index f973949..7ec463b 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -183,6 +183,11 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
for (auto T : {MVT::i32, MVT::i64})
setOperationAction(Op, T, Custom);
+ if (Subtarget->hasRelaxedSIMD()) {
+ setOperationAction(
+ {ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXNUM, ISD::FMAXIMUMNUM},
+ {MVT::v4f32, MVT::v2f64}, Legal);
+ }
// SIMD-specific configuration
if (Subtarget->hasSIMD128()) {
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 7840620..14097d7 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -1336,22 +1336,25 @@ def pmax : PatFrags<(ops node:$lhs, node:$rhs), [
]>;
defm PMAX : SIMDBinaryFP<pmax, "pmax", 235>;
+multiclass PMinMaxInt<Vec vec, NI baseMinInst, NI baseMaxInst> {
+ def : Pat<(vec.int_vt (vselect
+ (setolt (vec.vt (bitconvert V128:$rhs)),
+ (vec.vt (bitconvert V128:$lhs))),
+ V128:$rhs, V128:$lhs)),
+ (baseMinInst $lhs, $rhs)>;
+ def : Pat<(vec.int_vt (vselect
+ (setolt (vec.vt (bitconvert V128:$lhs)),
+ (vec.vt (bitconvert V128:$rhs))),
+ V128:$rhs, V128:$lhs)),
+ (baseMaxInst $lhs, $rhs)>;
+}
// Also match the pmin/pmax cases where the operands are int vectors (but the
// comparison is still a floating point comparison). This can happen when using
// the wasm_simd128.h intrinsics because v128_t is an integer vector.
foreach vec = [F32x4, F64x2, F16x8] in {
-defvar pmin = !cast<NI>("PMIN_"#vec);
-defvar pmax = !cast<NI>("PMAX_"#vec);
-def : Pat<(vec.int_vt (vselect
- (setolt (vec.vt (bitconvert V128:$rhs)),
- (vec.vt (bitconvert V128:$lhs))),
- V128:$rhs, V128:$lhs)),
- (pmin $lhs, $rhs)>;
-def : Pat<(vec.int_vt (vselect
- (setolt (vec.vt (bitconvert V128:$lhs)),
- (vec.vt (bitconvert V128:$rhs))),
- V128:$rhs, V128:$lhs)),
- (pmax $lhs, $rhs)>;
+ defvar pmin = !cast<NI>("PMIN_"#vec);
+ defvar pmax = !cast<NI>("PMAX_"#vec);
+ defm : PMinMaxInt<vec, pmin, pmax>;
}
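As a concrete instance of the int-vector case described in the comment above, here is a hedged C++ sketch (assuming clang targeting wasm32 with -msimd128 and the usual wasm_simd128.h intrinsic names) whose v128_t operands are integer vectors even though the comparison is a floating-point one:

#include <wasm_simd128.h>

// v128_t is an integer vector type, so the floating-point compare performed by
// the pmin intrinsic reaches instruction selection as a vselect over integer
// vectors; the PMinMaxInt patterns above exist to match exactly this shape.
v128_t clamp_upper(v128_t x, v128_t limit) {
  return wasm_f32x4_pmin(limit, x);
}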
// And match the pmin/pmax LLVM intrinsics as well
@@ -1742,6 +1745,32 @@ defm SIMD_RELAXED_FMIN :
defm SIMD_RELAXED_FMAX :
RelaxedBinary<F64x2, int_wasm_relaxed_max, "relaxed_max", 0x110>;
+let Predicates = [HasRelaxedSIMD] in {
+ foreach vec = [F32x4, F64x2] in {
+ defvar relaxed_min = !cast<NI>("SIMD_RELAXED_FMIN_"#vec);
+ defvar relaxed_max = !cast<NI>("SIMD_RELAXED_FMAX_"#vec);
+
+    // Transform fminnum/fminimumnum/fmaxnum/fmaximumnum to the relaxed versions
+ def : Pat<(vec.vt (fminnum (vec.vt V128:$lhs), (vec.vt V128:$rhs))),
+ (relaxed_min V128:$lhs, V128:$rhs)>;
+ def : Pat<(vec.vt (fminimumnum (vec.vt V128:$lhs), (vec.vt V128:$rhs))),
+ (relaxed_min V128:$lhs, V128:$rhs)>;
+ def : Pat<(vec.vt (fmaxnum (vec.vt V128:$lhs), (vec.vt V128:$rhs))),
+ (relaxed_max V128:$lhs, V128:$rhs)>;
+ def : Pat<(vec.vt (fmaximumnum (vec.vt V128:$lhs), (vec.vt V128:$rhs))),
+ (relaxed_max V128:$lhs, V128:$rhs)>;
+
+    // Transform pmin/pmax-shaped patterns to relaxed min/max
+ let AddedComplexity = 1 in {
+ def : Pat<(vec.vt (pmin (vec.vt V128:$lhs), (vec.vt V128:$rhs))),
+ (relaxed_min $lhs, $rhs)>;
+ def : Pat<(vec.vt (pmax (vec.vt V128:$lhs), (vec.vt V128:$rhs))),
+ (relaxed_max $lhs, $rhs)>;
+ defm : PMinMaxInt<vec, relaxed_min, relaxed_max>;
+ }
+ }
+}
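A short, hedged example of the kind of source the new relaxed patterns target; the function and variable names are illustrative. Vectorized std::fmin/std::fmax calls become fminnum/fmaxnum nodes, which under relaxed SIMD may now be selected as f32x4.relaxed_min/relaxed_max instead of the longer NaN-propagating expansions.

#include <cmath>

// Clamp each element of Data into [Lo[i], Hi[i]]. With relaxed SIMD enabled,
// the vectorized fmin/fmax here can lower to relaxed_min/relaxed_max.
void clampBuffer(float *Data, const float *Lo, const float *Hi, int N) {
  for (int I = 0; I < N; ++I)
    Data[I] = std::fmax(Lo[I], std::fmin(Data[I], Hi[I]));
}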
+
//===----------------------------------------------------------------------===//
// Relaxed rounding q15 multiplication
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b5f8ee5..d49f25a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -20558,7 +20558,7 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL,
// float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
// NOTE: By using fsub of a positive constant instead of fadd of a negative
- // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
+ // constant, we avoid reassociation in MachineCombiner when reassoc is
// enabled. See PR24512.
SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
// TODO: Are there any fast-math-flags to propagate here?
@@ -29516,11 +29516,8 @@ static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
if (IgnoreNaN || DAG.isKnownNeverNaN(IsNum ? NewY : NewX))
return MinMax;
- if (DAG.isKnownNeverNaN(NewX))
- NewX = NewY;
-
- SDValue IsNaN =
- DAG.getSetCC(DL, SetCCType, NewX, NewX, IsNum ? ISD::SETO : ISD::SETUO);
+ SDValue NaNSrc = IsNum ? MinMax : NewX;
+ SDValue IsNaN = DAG.getSetCC(DL, SetCCType, NaNSrc, NaNSrc, ISD::SETUO);
return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax);
}
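For context on the NaN handling adjusted here, a reference sketch of the semantics the two node families carry (IEEE 754-2019 minimum vs. minimumNumber); the helper names are illustrative and sign-of-zero handling is omitted for brevity.

#include <cmath>

// FMINIMUM propagates NaN; FMINIMUMNUM prefers the numeric operand.
float fminimum_ref(float A, float B) {
  if (std::isnan(A) || std::isnan(B))
    return NAN;
  return A < B ? A : B;
}
float fminimumnum_ref(float A, float B) {
  if (std::isnan(A))
    return B;
  if (std::isnan(B))
    return A;
  return A < B ? A : B;
}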
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 83bd6ac..1b748b7 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -5519,7 +5519,7 @@ defm VMIN : avx512_binop_s_sae<0x5D, "vmin", X86any_fmin, X86fmins, X86fminSAEs,
defm VMAX : avx512_binop_s_sae<0x5F, "vmax", X86any_fmax, X86fmaxs, X86fmaxSAEs,
SchedWriteFCmpSizes, 0>;
-// MIN/MAX nodes are commutable under "unsafe-fp-math". In this case we use
+// MIN/MAX nodes are commutable under (nnan + ninf). In this case we use
// X86fminc and X86fmaxc instead of X86fmin and X86fmax
multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr,
X86VectorVTInfo _, SDNode OpNode,
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td
index cc30054..ac4d31d 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver4.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td
@@ -15,7 +15,7 @@
//===----------------------------------------------------------------------===//
def Znver4Model : SchedMachineModel {
- // AMD SOG Zen4, 2.9.6 Dispatch
+ // AMD SOG Zen4, 2.9.8 Dispatch
// The processor may dispatch up to 6 macro ops per cycle
// into the execution engine.
let IssueWidth = 6;
@@ -46,8 +46,9 @@ def Znver4Model : SchedMachineModel {
int VecLoadLatency = 7;
// Latency of a simple store operation.
int StoreLatency = 1;
- // FIXME:
- let HighLatency = 25; // FIXME: any better choice?
+  // Mean and median latency of all instructions with latency > 6.
+  // Source: Zen4 Instruction Latencies spreadsheet (included with the SOG)
+ let HighLatency = 13;
// AMD SOG Zen4, 2.8 Optimizing Branching
// The branch misprediction penalty is in the range from 11 to 18 cycles,
// <...>. The common case penalty is 13 cycles.
@@ -612,6 +613,7 @@ def Zn4WriteLEA : SchedWriteVariant<[
def : InstRW<[Zn4WriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;
+// values from uops.info
def Zn4SlowLEA16r : SchedWriteRes<[Zn4ALU0123]> {
let Latency = 2; // FIXME: not from llvm-exegesis
let ReleaseAtCycles = [4];
@@ -659,15 +661,15 @@ def : InstRW<[Zn4WriteCMPXCHG8rm_LCMPXCHG8], (instrs CMPXCHG8rm, LCMPXCHG8)>;
def Zn4WriteCMPXCHG8B : SchedWriteRes<[Zn4ALU0123]> {
let Latency = 3; // FIXME: not from llvm-exegesis
- let ReleaseAtCycles = [24];
- let NumMicroOps = 19;
+ let ReleaseAtCycles = [20];
+ let NumMicroOps = 15;
}
def : InstRW<[Zn4WriteCMPXCHG8B], (instrs CMPXCHG8B)>;
def Zn4WriteCMPXCHG16B_LCMPXCHG16B : SchedWriteRes<[Zn4ALU0123]> {
- let Latency = 4; // FIXME: not from llvm-exegesis
- let ReleaseAtCycles = [59];
- let NumMicroOps = 28;
+ let Latency = 2; // FIXME: not from llvm-exegesis
+ let ReleaseAtCycles = [40];
+ let NumMicroOps = 26;
}
def : InstRW<[Zn4WriteCMPXCHG16B_LCMPXCHG16B], (instrs CMPXCHG16B, LCMPXCHG16B)>;
@@ -681,7 +683,7 @@ def : InstRW<[Zn4WriteWriteXCHGUnrenameable], (instrs XCHG8rr, XCHG16rr, XCHG16a
def Zn4WriteXCHG8rm_XCHG16rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
let Latency = !add(Znver4Model.LoadLatency, 3); // FIXME: not from llvm-exegesis
let ReleaseAtCycles = [1, 1, 2];
- let NumMicroOps = 5;
+ let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteXCHG8rm_XCHG16rm], (instrs XCHG8rm, XCHG16rm)>;
@@ -693,19 +695,17 @@ def Zn4WriteXCHG32rm_XCHG64rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]>
def : InstRW<[Zn4WriteXCHG32rm_XCHG64rm], (instrs XCHG32rm, XCHG64rm)>;
// Integer division.
-// FIXME: uops for 8-bit division measures as 2. for others it's a guess.
-// FIXME: latency for 8-bit division measures as 10. for others it's a guess.
-defm : Zn4WriteResIntPair<WriteDiv8, [Zn4Divider], 10, [10], 2>;
-defm : Zn4WriteResIntPair<WriteDiv16, [Zn4Divider], 11, [11], 2>;
-defm : Zn4WriteResIntPair<WriteDiv32, [Zn4Divider], 13, [13], 2>;
-defm : Zn4WriteResIntPair<WriteDiv64, [Zn4Divider], 17, [17], 2>;
-defm : Zn4WriteResIntPair<WriteIDiv8, [Zn4Divider], 10, [10], 2>;
-defm : Zn4WriteResIntPair<WriteIDiv16, [Zn4Divider], 11, [11], 2>;
-defm : Zn4WriteResIntPair<WriteIDiv32, [Zn4Divider], 13, [13], 2>;
-defm : Zn4WriteResIntPair<WriteIDiv64, [Zn4Divider], 17, [17], 2>;
-
-defm : Zn4WriteResIntPair<WriteBSF, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan forward.
-defm : Zn4WriteResIntPair<WriteBSR, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan reverse.
+defm : Zn4WriteResIntPair<WriteDiv8, [Zn4Divider], 9, [9], 2>;
+defm : Zn4WriteResIntPair<WriteDiv16, [Zn4Divider], 10, [10], 2>;
+defm : Zn4WriteResIntPair<WriteDiv32, [Zn4Divider], 12, [12], 2>;
+defm : Zn4WriteResIntPair<WriteDiv64, [Zn4Divider], 18, [18], 2>;
+defm : Zn4WriteResIntPair<WriteIDiv8, [Zn4Divider], 9, [9], 2>;
+defm : Zn4WriteResIntPair<WriteIDiv16, [Zn4Divider], 10, [10], 2>;
+defm : Zn4WriteResIntPair<WriteIDiv32, [Zn4Divider], 12, [12], 2>;
+defm : Zn4WriteResIntPair<WriteIDiv64, [Zn4Divider], 18, [18], 2>;
+
+defm : Zn4WriteResIntPair<WriteBSF, [Zn4ALU1], 1, [1], 1, /*LoadUOps=*/1>; // Bit scan forward.
+defm : Zn4WriteResIntPair<WriteBSR, [Zn4ALU1], 1, [1], 1, /*LoadUOps=*/1>; // Bit scan reverse.
defm : Zn4WriteResIntPair<WritePOPCNT, [Zn4ALU0123], 1, [1], 1>; // Bit population count.
@@ -725,12 +725,12 @@ def Zn4WriteLZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
}
def : InstRW<[Zn4WriteLZCNT16rr], (instrs LZCNT16rr)>;
-defm : Zn4WriteResIntPair<WriteTZCNT, [Zn4ALU12], 2, [1], 2>; // Trailing zero count.
+defm : Zn4WriteResIntPair<WriteTZCNT, [Zn4ALU12], 1, [1], 1>; // Trailing zero count.
def Zn4WriteTZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
- let Latency = 2;
- let ReleaseAtCycles = [4];
- let NumMicroOps = 2;
+ let Latency = 1;
+ let ReleaseAtCycles = [1];
+ let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteTZCNT16rr], (instrs TZCNT16rr)>;
@@ -1109,15 +1109,31 @@ def Zn4WriteVecOpMaskKRMov : SchedWriteRes<[Zn4FPOpMask4]> {
}
def : InstRW<[Zn4WriteVecOpMaskKRMov], (instrs KMOVBkr, KMOVDkr, KMOVQkr, KMOVWkr)>;
-def Zn4WriteVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
- // TODO: All align instructions are expected to be of 4 cycle latency
- let Latency = 4;
+// 128-bit VALIGN
+def Zn4WriteXMMVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
+ let Latency = 2;
let ReleaseAtCycles = [1];
let NumMicroOps = 1;
}
-def : InstRW<[Zn4WriteVecALU2Slow], (instrs VALIGNDZrri, VALIGNDZ128rri, VALIGNDZ256rri,
- VALIGNQZrri, VALIGNQZ128rri, VALIGNQZ256rri)
- >;
+
+// 256-bit VALIGN
+def Zn4WriteYMMVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
+ let Latency = 3;
+ let ReleaseAtCycles = [1];
+ let NumMicroOps = 1;
+}
+
+// 512-bit VALIGN
+def Zn4WriteZMMVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
+ let Latency = 4;
+ let ReleaseAtCycles = [2];
+ let NumMicroOps = 1;
+}
+
+def : InstRW<[Zn4WriteXMMVecALU2Slow], (instrs VALIGNDZ128rri, VALIGNQZ128rri)>;
+def : InstRW<[Zn4WriteYMMVecALU2Slow], (instrs VALIGNDZ256rri, VALIGNQZ256rri)>;
+def : InstRW<[Zn4WriteZMMVecALU2Slow], (instrs VALIGNDZrri, VALIGNQZrri)>;
+
defm : Zn4WriteResYMMPair<WriteVecALUY, [Zn4FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (YMM).
def Zn4WriteVecALUYSlow : SchedWriteRes<[Zn4FPVAdd01]> {
@@ -1326,9 +1342,9 @@ def : InstRW<[Zn4WriteSHA256RNDS2rr], (instrs SHA256RNDS2rr)>;
// Strings instructions.
// Packed Compare Implicit Length Strings, Return Mask
-defm : Zn4WriteResXMMPair<WritePCmpIStrM, [Zn4FPVAdd0123], 6, [8], 3, /*LoadUOps=*/1>;
+defm : Zn4WriteResXMMPair<WritePCmpIStrM, [Zn4FPVAdd0123], 7, [8], 3, /*LoadUOps=*/1>;
// Packed Compare Explicit Length Strings, Return Mask
-defm : Zn4WriteResXMMPair<WritePCmpEStrM, [Zn4FPVAdd0123], 6, [12], 7, /*LoadUOps=*/5>;
+defm : Zn4WriteResXMMPair<WritePCmpEStrM, [Zn4FPVAdd0123], 7, [12], 7, /*LoadUOps=*/5>;
// Packed Compare Implicit Length Strings, Return Index
defm : Zn4WriteResXMMPair<WritePCmpIStrI, [Zn4FPVAdd0123], 2, [8], 4>;
// Packed Compare Explicit Length Strings, Return Index
@@ -1340,7 +1356,7 @@ defm : Zn4WriteResXMMPair<WriteAESIMC, [Zn4FPAES01], 4, [1], 1>; // InvMixColumn
defm : Zn4WriteResXMMPair<WriteAESKeyGen, [Zn4FPAES01], 4, [1], 1>; // Key Generation.
// Carry-less multiplication instructions.
-defm : Zn4WriteResXMMPair<WriteCLMul, [Zn4FPCLM01], 4, [4], 4>;
+defm : Zn4WriteResXMMPair<WriteCLMul, [Zn4FPCLM01], 4, [3], 4>;
// EMMS/FEMMS
defm : Zn4WriteResInt<WriteEMMS, [Zn4ALU0123], 2, [1], 1>; // FIXME: latency not from llvm-exegesis
@@ -1386,44 +1402,44 @@ def Zn4WriteVPERM2F128rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
def : InstRW<[Zn4WriteVPERM2F128rm], (instrs VPERM2F128rmi)>;
def Zn4WriteVPERMPSYrr : SchedWriteRes<[Zn4FPVShuf]> {
- let Latency = 7;
+ let Latency = 4;
let ReleaseAtCycles = [1];
- let NumMicroOps = 2;
+ let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVPERMPSYrr], (instrs VPERMPSYrr)>;
def Zn4WriteVPERMPSYrm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMPSYrr.Latency);
- let ReleaseAtCycles = [1, 1, 2];
- let NumMicroOps = !add(Zn4WriteVPERMPSYrr.NumMicroOps, 1);
+ let ReleaseAtCycles = [1, 1, 1];
+ let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVPERMPSYrm], (instrs VPERMPSYrm)>;
def Zn4WriteVPERMYri : SchedWriteRes<[Zn4FPVShuf]> {
- let Latency = 6;
+ let Latency = 4;
let ReleaseAtCycles = [1];
- let NumMicroOps = 2;
+ let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVPERMYri], (instrs VPERMPDYri, VPERMQYri)>;
def Zn4WriteVPERMPDYmi : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMYri.Latency);
- let ReleaseAtCycles = [1, 1, 2];
- let NumMicroOps = !add(Zn4WriteVPERMYri.NumMicroOps, 1);
+ let ReleaseAtCycles = [1, 1, 1];
+ let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVPERMPDYmi], (instrs VPERMPDYmi)>;
def Zn4WriteVPERMDYrr : SchedWriteRes<[Zn4FPVShuf]> {
- let Latency = 5;
+ let Latency = 4;
let ReleaseAtCycles = [1];
- let NumMicroOps = 2;
+ let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVPERMDYrr], (instrs VPERMDYrr)>;
def Zn4WriteVPERMYm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMDYrr.Latency);
- let ReleaseAtCycles = [1, 1, 2];
- let NumMicroOps = !add(Zn4WriteVPERMDYrr.NumMicroOps, 0);
+ let ReleaseAtCycles = [1, 1, 1];
+ let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVPERMYm], (instrs VPERMQYmi, VPERMDYrm)>;