Diffstat (limited to 'llvm/lib/Target')
39 files changed, 771 insertions, 282 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 662d84b..a81de5c 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -27602,6 +27602,15 @@ static SDValue performPTestFirstCombine(SDNode *N, static SDValue performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { + SDLoc DL(N); + + // If a DUP(Op0) already exists, reuse it for the scalar_to_vector. + if (DCI.isAfterLegalizeDAG()) { + if (SDNode *LN = DCI.DAG.getNodeIfExists(AArch64ISD::DUP, N->getVTList(), + N->getOperand(0))) + return SDValue(LN, 0); + } + // Let's do below transform. // // t34: v4i32 = AArch64ISD::UADDLV t2 @@ -27638,7 +27647,6 @@ performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, return SDValue(); // Let's generate new sequence with AArch64ISD::NVCAST. - SDLoc DL(N); SDValue EXTRACT_SUBVEC = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, UADDLV, DAG.getConstant(0, DL, MVT::i64)); diff --git a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td index 31fcd63..5d9215d 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td +++ b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td @@ -136,8 +136,8 @@ def : Pat<(f32 (bitconvert (i32 (relaxed_load<atomic_load_nonext_32> (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend))))), (LDRSroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend)>; def : Pat<(f32 (bitconvert (i32 (relaxed_load<atomic_load_nonext_32> - (am_indexed32 GPR64sp:$Rn, uimm12s8:$offset))))), - (LDRSui GPR64sp:$Rn, uimm12s8:$offset)>; + (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))), + (LDRSui GPR64sp:$Rn, uimm12s4:$offset)>; def : Pat<(f32 (bitconvert (i32 (relaxed_load<atomic_load_nonext_32> (am_unscaled32 GPR64sp:$Rn, simm9:$offset))))), (LDURSi GPR64sp:$Rn, simm9:$offset)>; @@ -236,11 +236,11 @@ def : Pat<(relaxed_store<atomic_store_32> def : Pat<(releasing_store<atomic_store_64> GPR64sp:$ptr, GPR64:$val), (STLRX GPR64:$val, GPR64sp:$ptr)>; def : Pat<(relaxed_store<atomic_store_64> (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm, - ro_Wextend16:$extend), + ro_Wextend64:$extend), GPR64:$val), (STRXroW GPR64:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>; def : Pat<(relaxed_store<atomic_store_64> (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm, - ro_Xextend16:$extend), + ro_Xextend64:$extend), GPR64:$val), (STRXroX GPR64:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>; def : Pat<(relaxed_store<atomic_store_64> @@ -276,8 +276,8 @@ def : Pat<(relaxed_store<atomic_store_64> (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm, (i64 (bitconvert (f64 FPR64Op:$val)))), (STRDroX FPR64Op:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>; def : Pat<(relaxed_store<atomic_store_64> - (am_indexed64 GPR64sp:$Rn, uimm12s4:$offset), (i64 (bitconvert (f64 FPR64Op:$val)))), - (STRDui FPR64Op:$val, GPR64sp:$Rn, uimm12s4:$offset)>; + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset), (i64 (bitconvert (f64 FPR64Op:$val)))), + (STRDui FPR64Op:$val, GPR64sp:$Rn, uimm12s8:$offset)>; def : Pat<(relaxed_store<atomic_store_64> (am_unscaled64 GPR64sp:$Rn, simm9:$offset), (i64 (bitconvert (f64 FPR64Op:$val)))), (STURDi FPR64Op:$val, GPR64sp:$Rn, simm9:$offset)>; diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td index fe84193..30b7b03 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td +++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td @@ -507,7 +507,7 
@@ let AddedComplexity = 19 in { defm : VecROStoreLane64_0Pat<ro32, store, v2i32, i32, ssub, STRSroW, STRSroX>; } -def : Pat<(v8i8 (AArch64dup (i8 (load (am_indexed8 GPR64sp:$Rn))))), +def : Pat<(v8i8 (AArch64dup (i8 (load GPR64sp:$Rn)))), (LD1Rv8b GPR64sp:$Rn)>; def : Pat<(v16i8 (AArch64dup (i8 (load GPR64sp:$Rn)))), (LD1Rv16b GPR64sp:$Rn)>; diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td index ef974df..47144c7 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td @@ -993,7 +993,7 @@ def PPR_3b : PPRClass<0, 7> { // Restricted 3 bit SVE predicate register class. let DecoderMethod = "DecodeSimpleRegisterClass<AArch64::PPRRegClassID, 0, 8>"; } def PPR_p8to15 : PPRClass<8, 15> { - let DecoderMethod = "DecodeSimpleRegisterClass<AArch64::PNRRegClassID, 8, 8>"; + let DecoderMethod = "DecodeSimpleRegisterClass<AArch64::PPRRegClassID, 8, 8>"; } def PPRMul2 : PPRClass<0, 14, 2>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp new file mode 100644 index 0000000..30a1f05 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp @@ -0,0 +1,73 @@ +//===--- AMDGPUBarrierLatency.cpp - AMDGPU Barrier Latency ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file This file contains a DAG scheduling mutation to add latency to +/// barrier edges between ATOMIC_FENCE instructions and preceding +/// memory accesses potentially affected by the fence. +/// This encourages the scheduling of more instructions before +/// ATOMIC_FENCE instructions. ATOMIC_FENCE instructions may +/// introduce wait counting or indicate an impending S_BARRIER +/// wait. Having more instructions in-flight across these +/// constructs improves latency hiding. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUBarrierLatency.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/ScheduleDAGInstrs.h" + +using namespace llvm; + +namespace { + +class BarrierLatency : public ScheduleDAGMutation { +public: + BarrierLatency() = default; + void apply(ScheduleDAGInstrs *DAG) override; +}; + +void BarrierLatency::apply(ScheduleDAGInstrs *DAG) { + constexpr unsigned SyntheticLatency = 2000; + for (SUnit &SU : DAG->SUnits) { + const MachineInstr *MI = SU.getInstr(); + if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE) + continue; + + // Update latency on barrier edges of ATOMIC_FENCE. + // We don't consider the scope of the fence or type of instruction + // involved in the barrier edge. 
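+  // (What follows: walk this fence's barrier predecessor edges, keep only
+  // pure loads (mayLoad() && !mayStore()), then raise the latency on both
+  // views of each edge -- the predecessor's Succs copy and this node's
+  // Preds copy -- and invalidate the cached depths on both ends.)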
+ for (SDep &PredDep : SU.Preds) { + if (!PredDep.isBarrier()) + continue; + SUnit *PredSU = PredDep.getSUnit(); + MachineInstr *MI = PredSU->getInstr(); + // Only consider memory loads + if (!MI->mayLoad() || MI->mayStore()) + continue; + SDep ForwardD = PredDep; + ForwardD.setSUnit(&SU); + for (SDep &SuccDep : PredSU->Succs) { + if (SuccDep == ForwardD) { + SuccDep.setLatency(SuccDep.getLatency() + SyntheticLatency); + break; + } + } + PredDep.setLatency(PredDep.getLatency() + SyntheticLatency); + PredSU->setDepthDirty(); + SU.setDepthDirty(); + } + } +} + +} // end namespace + +std::unique_ptr<ScheduleDAGMutation> +llvm::createAMDGPUBarrierLatencyDAGMutation() { + return std::make_unique<BarrierLatency>(); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h new file mode 100644 index 0000000..c23f0b9 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h @@ -0,0 +1,21 @@ +//===- AMDGPUBarrierLatency.h - AMDGPU Export Clustering --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H + +#include "llvm/CodeGen/ScheduleDAGMutation.h" +#include <memory> + +namespace llvm { + +std::unique_ptr<ScheduleDAGMutation> createAMDGPUBarrierLatencyDAGMutation(); + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 4958a20..996b55f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -17,6 +17,7 @@ #include "AMDGPUTargetMachine.h" #include "AMDGPU.h" #include "AMDGPUAliasAnalysis.h" +#include "AMDGPUBarrierLatency.h" #include "AMDGPUCtorDtorLowering.h" #include "AMDGPUExportClustering.h" #include "AMDGPUExportKernelRuntimeHandles.h" @@ -639,6 +640,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial)); DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); + DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation()); return DAG; } @@ -659,6 +661,7 @@ createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) { if (ST.shouldClusterStores()) DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); + DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation()); return DAG; } @@ -1197,6 +1200,7 @@ GCNTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const { EnableVOPD) DAG->addMutation(createVOPDPairingMutation()); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); + DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation()); return DAG; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 13f727b68..a1e0e52 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -52,6 +52,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUAsmPrinter.cpp AMDGPUAtomicOptimizer.cpp 
AMDGPUAttributor.cpp + AMDGPUBarrierLatency.cpp AMDGPUCallLowering.cpp AMDGPUCodeGenPrepare.cpp AMDGPUCombinerHelper.cpp diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 74d4153..6f1feb1 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2223,8 +2223,8 @@ def : GCNPat < def : GCNPat < (DivergentUnaryFrag<fneg> (v2f32 VReg_64:$src)), - (V_PK_ADD_F32 11 /* OP_SEL_1 | NEG_LO | HEG_HI */, VReg_64:$src, - 11 /* OP_SEL_1 | NEG_LO | HEG_HI */, (i64 0), + (V_PK_ADD_F32 !or(SRCMODS.OP_SEL_1, SRCMODS.NEG, SRCMODS.NEG_HI), VReg_64:$src, + !or(SRCMODS.OP_SEL_1, SRCMODS.NEG, SRCMODS.NEG_HI), (i64 0), 0, 0, 0, 0, 0) > { let SubtargetPredicate = HasPackedFP32Ops; diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index bdbc000..07264d9 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -397,12 +397,6 @@ public: bool IsCrossAddrSpaceOrdering, Position Pos) const = 0; - /// Inserts any necessary instructions before the barrier start instruction - /// \p MI in order to support pairing of barriers and fences. - virtual bool insertBarrierStart(MachineBasicBlock::iterator &MI) const { - return false; - }; - /// Virtual destructor to allow derivations to be deleted. virtual ~SICacheControl() = default; }; @@ -583,12 +577,8 @@ public: bool IsCrossAddrSpaceOrdering, Position Pos, AtomicOrdering Order, bool AtomicsOnly) const override; - bool insertAcquire(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - Position Pos) const override; - - bool insertBarrierStart(MachineBasicBlock::iterator &MI) const override; + bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, Position Pos) const override; }; class SIGfx11CacheControl : public SIGfx10CacheControl { @@ -2069,8 +2059,11 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, // the WGP. Therefore need to wait for operations to complete to ensure // they are visible to waves in the other CU as the L0 is per CU. // Otherwise in CU mode and all waves of a work-group are on the same CU - // which shares the same L0. - if (!ST.isCuModeEnabled()) { + // which shares the same L0. Note that we still need to wait when + // performing a release in this mode to respect the transitivity of + // happens-before, e.g. other waves of the workgroup must be able to + // release the memory from another wave at a wider scope. + if (!ST.isCuModeEnabled() || isReleaseOrStronger(Order)) { if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) VMCnt |= true; if ((Op & SIMemOp::STORE) != SIMemOp::NONE) @@ -2225,22 +2218,6 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, return Changed; } -bool SIGfx10CacheControl::insertBarrierStart( - MachineBasicBlock::iterator &MI) const { - // We need to wait on vm_vsrc so barriers can pair with fences in GFX10+ CU - // mode. This is because a CU mode release fence does not emit any wait, which - // is fine when only dealing with vmem, but isn't sufficient in the presence - // of barriers which do not go through vmem. - // GFX12.5 does not require this additional wait. 
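The transitivity rationale in the new insertWait comments mirrors ordinary C++ happens-before chaining. A minimal CPU-side sketch (hypothetical wave names; std::atomic stands in for GPU scopes): wave B observes A's workgroup-scope release and re-releases the same data for a third observer, so A's store must already be visible beyond A's own scope.

#include <atomic>
#include <cassert>
#include <thread>

int data = 0;
std::atomic<int> wgFlag{0}, devFlag{0};

void waveA() { data = 1; wgFlag.store(1, std::memory_order_release); }
void waveB() {
  while (!wgFlag.load(std::memory_order_acquire)) {} // sees A's release
  devFlag.store(1, std::memory_order_release); // re-release at a wider "scope"
}
void waveC() {
  while (!devFlag.load(std::memory_order_acquire)) {}
  assert(data == 1); // holds: happens-before is transitive through B
}

int main() {
  std::thread a(waveA), b(waveB), c(waveC);
  a.join(); b.join(); c.join();
  return 0;
}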
- if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) - return false; - - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), - TII->get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0)); - return true; -} - bool SIGfx11CacheControl::enableLoadCacheBypass( const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const { @@ -2419,15 +2396,20 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, // In WGP mode the waves of a work-group can be executing on either CU // of the WGP. Therefore need to wait for operations to complete to // ensure they are visible to waves in the other CU as the L0 is per CU. + // // Otherwise in CU mode and all waves of a work-group are on the same CU - // which shares the same L0. + // which shares the same L0. Note that we still need to wait when + // performing a release in this mode to respect the transitivity of + // happens-before, e.g. other waves of the workgroup must be able to + // release the memory from another wave at a wider scope. // // GFX12.5: // CU$ has two ports. To ensure operations are visible at the workgroup // level, we need to ensure all operations in this port have completed // so the other SIMDs in the WG can see them. There is no ordering // guarantee between the ports. - if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) { + if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts() || + isReleaseOrStronger(Order)) { if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) LOADCnt |= true; if ((Op & SIMemOp::STORE) != SIMemOp::NONE) @@ -3017,11 +2999,6 @@ bool SIMemoryLegalizer::run(MachineFunction &MF) { MI = II->getIterator(); } - if (ST.getInstrInfo()->isBarrierStart(MI->getOpcode())) { - Changed |= CC->insertBarrierStart(MI); - continue; - } - if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic)) continue; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 42ec8ba..7cce033 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -775,10 +775,10 @@ let SubtargetPredicate = HasMinimum3Maximum3F16, ReadsModeReg = 0 in { } // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 let SubtargetPredicate = HasAddMinMaxInsts, isCommutable = 1, isReMaterializable = 1 in { - defm V_ADD_MAX_I32 : VOP3Inst <"v_add_max_i32", VOP_I32_I32_I32_I32>; - defm V_ADD_MAX_U32 : VOP3Inst <"v_add_max_u32", VOP_I32_I32_I32_I32>; - defm V_ADD_MIN_I32 : VOP3Inst <"v_add_min_i32", VOP_I32_I32_I32_I32>; - defm V_ADD_MIN_U32 : VOP3Inst <"v_add_min_u32", VOP_I32_I32_I32_I32>; + defm V_ADD_MAX_I32 : VOP3Inst <"v_add_max_i32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; + defm V_ADD_MAX_U32 : VOP3Inst <"v_add_max_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; + defm V_ADD_MIN_I32 : VOP3Inst <"v_add_min_i32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; + defm V_ADD_MIN_U32 : VOP3Inst <"v_add_min_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; } defm V_ADD_I16 : VOP3Inst_t16 <"v_add_i16", VOP_I16_I16_I16>; diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp index 1f773e2..3368a50 100644 --- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp @@ -820,7 +820,7 @@ void ARMAsmPrinter::emitAttributes() { auto *BTIValue = mdconst::extract_or_null<ConstantInt>( SourceModule->getModuleFlag("branch-target-enforcement")); - if (BTIValue && BTIValue->isOne()) { + if (BTIValue && !BTIValue->isZero()) { // If "+pacbti" is used as 
an architecture extension, // Tag_BTI_extension is emitted in // ARMTargetStreamer::emitTargetAttributes(). diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 35e1127..b1a668e 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -1089,7 +1089,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, // Register based DivRem for AEABI (RTABI 4.2) if (TT.isTargetAEABI() || TT.isAndroid() || TT.isTargetGNUAEABI() || - TT.isTargetMuslAEABI() || TT.isOSWindows()) { + TT.isTargetMuslAEABI() || TT.isOSFuchsia() || TT.isOSWindows()) { setOperationAction(ISD::SREM, MVT::i64, Custom); setOperationAction(ISD::UREM, MVT::i64, Custom); HasStandaloneRem = false; @@ -1353,6 +1353,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, setOperationAction(ISD::FLOG10, MVT::f16, Promote); setOperationAction(ISD::FLOG2, MVT::f16, Promote); setOperationAction(ISD::LRINT, MVT::f16, Expand); + setOperationAction(ISD::LROUND, MVT::f16, Expand); setOperationAction(ISD::FROUND, MVT::f16, Legal); setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal); @@ -20574,7 +20575,7 @@ static TargetLowering::ArgListTy getDivRemArgList( SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() || Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() || - Subtarget->isTargetWindows()) && + Subtarget->isTargetFuchsia() || Subtarget->isTargetWindows()) && "Register-based DivRem lowering only"); unsigned Opcode = Op->getOpcode(); assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) && diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp index 96ee69c..597d311 100644 --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -882,7 +882,7 @@ static bool producesFalseLanesZero(MachineInstr &MI, continue; // Skip the lr predicate reg int PIdx = llvm::findFirstVPTPredOperandIdx(MI); - if (PIdx != -1 && (int)MO.getOperandNo() == PIdx + 2) + if (PIdx != -1 && MO.getOperandNo() == PIdx + ARM::SUBOP_vpred_n_tp_reg) continue; // Check that this instruction will produce zeros in its false lanes: @@ -1036,6 +1036,7 @@ bool LowOverheadLoop::ValidateLiveOuts() { while (!Worklist.empty()) { MachineInstr *MI = Worklist.pop_back_val(); if (MI->getOpcode() == ARM::MQPRCopy) { + LLVM_DEBUG(dbgs() << " Must generate copy as VMOV: " << *MI); VMOVCopies.insert(MI); MachineInstr *CopySrc = RDI.getUniqueReachingMIDef(MI, MI->getOperand(1).getReg()); @@ -1045,6 +1046,20 @@ bool LowOverheadLoop::ValidateLiveOuts() { LLVM_DEBUG(dbgs() << " Unable to handle live out: " << *MI); VMOVCopies.clear(); return false; + } else if (isVectorPredicated(MI)) { + // If this is a predicated instruction with merging semantics, + // check where it gets its false lanes from, if any. 
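+        // (Merging semantics, lane-wise: result[i] = mask[i] ? op(...)[i]
+        // : inactive[i], so the zero-false-lanes property only holds if the
+        // inactive source itself produces zeroed false lanes; queue that
+        // source for the same check.)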
+ int InactiveIdx = findVPTInactiveOperandIdx(*MI); + if (InactiveIdx != -1) { + SmallPtrSet<MachineInstr *, 2> Defs; + MachineInstr *FalseSrc = RDI.getUniqueReachingMIDef( + MI, MI->getOperand(InactiveIdx).getReg()); + if (FalseSrc) { + LLVM_DEBUG(dbgs() + << " Must check source of false lanes for: " << *MI); + Worklist.push_back(FalseSrc); + } + } } } diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h index b2d368e..4a0883c 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.h +++ b/llvm/lib/Target/ARM/ARMSubtarget.h @@ -343,6 +343,7 @@ public: bool isTargetWatchOS() const { return TargetTriple.isWatchOS(); } bool isTargetWatchABI() const { return TargetTriple.isWatchABI(); } bool isTargetDriverKit() const { return TargetTriple.isDriverKit(); } + bool isTargetFuchsia() const { return TargetTriple.isOSFuchsia(); } bool isTargetLinux() const { return TargetTriple.isOSLinux(); } bool isTargetNetBSD() const { return TargetTriple.isOSNetBSD(); } bool isTargetWindows() const { return TargetTriple.isOSWindows(); } diff --git a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp index 5eeb4fe..413e844 100644 --- a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp +++ b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp @@ -534,7 +534,7 @@ bool MVETPAndVPTOptimisations::ConvertTailPredLoop(MachineLoop *ML, Register LR = LoopPhi->getOperand(0).getReg(); for (MachineInstr *MI : MVEInstrs) { int Idx = findFirstVPTPredOperandIdx(*MI); - MI->getOperand(Idx + 2).setReg(LR); + MI->getOperand(Idx + ARM::SUBOP_vpred_n_tp_reg).setReg(LR); } } diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp index 431ce38..f5653d4 100644 --- a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -805,6 +805,16 @@ int llvm::findFirstVPTPredOperandIdx(const MachineInstr &MI) { return -1; } +int llvm::findVPTInactiveOperandIdx(const MachineInstr &MI) { + const MCInstrDesc &MCID = MI.getDesc(); + + for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i) + if (MCID.operands()[i].OperandType == ARM::OPERAND_VPRED_R) + return i + ARM::SUBOP_vpred_r_inactive; + + return -1; +} + ARMVCC::VPTCodes llvm::getVPTInstrPredicate(const MachineInstr &MI, Register &PredReg) { int PIdx = findFirstVPTPredOperandIdx(MI); diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/llvm/lib/Target/ARM/Thumb2InstrInfo.h index 3ec3a621..1b0bf2d 100644 --- a/llvm/lib/Target/ARM/Thumb2InstrInfo.h +++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.h @@ -90,6 +90,9 @@ inline ARMVCC::VPTCodes getVPTInstrPredicate(const MachineInstr &MI) { Register PredReg; return getVPTInstrPredicate(MI, PredReg); } +// Identify the input operand in an MVE predicated instruction which +// contributes the values of any inactive vector lanes. +int findVPTInactiveOperandIdx(const MachineInstr &MI); // Recomputes the Block Mask of Instr, a VPT or VPST instruction. 
// This rebuilds the block mask of the instruction depending on the predicates diff --git a/llvm/lib/Target/DirectX/DXILPrepare.cpp b/llvm/lib/Target/DirectX/DXILPrepare.cpp index c8866bf..42e90f0 100644 --- a/llvm/lib/Target/DirectX/DXILPrepare.cpp +++ b/llvm/lib/Target/DirectX/DXILPrepare.cpp @@ -294,6 +294,14 @@ public: if (NamedMDNode *RootSignature = M.getNamedMetadata("dx.rootsignatures")) RootSignature->eraseFromParent(); + // llvm.errno.tbaa was recently added but is not supported in LLVM 3.7 and + // causes all tests using the DXIL Validator to fail. + // + // This is a temporary fix and should be replaced with a whitelist once + // we have determined all metadata that the DXIL Validator allows + if (NamedMDNode *ErrNo = M.getNamedMetadata("llvm.errno.tbaa")) + ErrNo->eraseFromParent(); + return true; } diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp index a94e131..54c8972 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp @@ -117,8 +117,10 @@ HexagonTargetLowering::initializeHVXLowering() { setOperationAction(ISD::VECTOR_SHUFFLE, ByteW, Legal); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); - if (Subtarget.useHVX128BOps()) + if (Subtarget.useHVX128BOps()) { setOperationAction(ISD::BITCAST, MVT::v32i1, Custom); + setOperationAction(ISD::BITCAST, MVT::v64i1, Custom); + } if (Subtarget.useHVX128BOps() && Subtarget.useHVXV68Ops() && Subtarget.useHVXFloatingPoint()) { @@ -2024,13 +2026,9 @@ HexagonTargetLowering::LowerHvxBitcast(SDValue Op, SelectionDAG &DAG) const { // Handle bitcast from i32, v2i16, and v4i8 to v32i1. // Splat the input into a 32-element i32 vector, then AND each element // with a unique bitmask to isolate individual bits. - if (ResTy == MVT::v32i1 && - (ValTy == MVT::i32 || ValTy == MVT::v2i16 || ValTy == MVT::v4i8) && - Subtarget.useHVX128BOps()) { - SDValue Val32 = Val; - if (ValTy == MVT::v2i16 || ValTy == MVT::v4i8) - Val32 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Val); - + auto bitcastI32ToV32I1 = [&](SDValue Val32) { + assert(Val32.getValueType().getSizeInBits() == 32 && + "Input must be 32 bits"); MVT VecTy = MVT::getVectorVT(MVT::i32, 32); SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, dl, VecTy, Val32); SmallVector<SDValue, 32> Mask; @@ -2039,7 +2037,31 @@ HexagonTargetLowering::LowerHvxBitcast(SDValue Op, SelectionDAG &DAG) const { SDValue MaskVec = DAG.getBuildVector(VecTy, dl, Mask); SDValue Anded = DAG.getNode(ISD::AND, dl, VecTy, Splat, MaskVec); - return DAG.getNode(HexagonISD::V2Q, dl, ResTy, Anded); + return DAG.getNode(HexagonISD::V2Q, dl, MVT::v32i1, Anded); + }; + // === Case: v32i1 === + if (ResTy == MVT::v32i1 && + (ValTy == MVT::i32 || ValTy == MVT::v2i16 || ValTy == MVT::v4i8) && + Subtarget.useHVX128BOps()) { + SDValue Val32 = Val; + if (ValTy == MVT::v2i16 || ValTy == MVT::v4i8) + Val32 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Val); + return bitcastI32ToV32I1(Val32); + } + // === Case: v64i1 === + if (ResTy == MVT::v64i1 && ValTy == MVT::i64 && Subtarget.useHVX128BOps()) { + // Split i64 into lo/hi 32-bit halves. + SDValue Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Val); + SDValue HiShifted = DAG.getNode(ISD::SRL, dl, MVT::i64, Val, + DAG.getConstant(32, dl, MVT::i64)); + SDValue Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, HiShifted); + + // Reuse the same 32-bit logic twice. 
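+    // (Scalar picture: lane i of the v64i1 result is (V >> i) & 1; lanes
+    // 0..31 come from uint32_t(V), lanes 32..63 from uint32_t(V >> 32).)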
+ SDValue LoRes = bitcastI32ToV32I1(Lo); + SDValue HiRes = bitcastI32ToV32I1(Hi); + + // Concatenate into a v64i1 predicate. + return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, LoRes, HiRes); } if (isHvxBoolTy(ResTy) && ValTy.isScalarInteger()) { diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 8bf0d11..17f04d0 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -442,14 +442,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); // If we're enabling GP optimizations, use hardware square root - if (!Subtarget.hasFSQRT() && - !(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTE() && - Subtarget.hasFRE())) + if (!Subtarget.hasFSQRT() && !(Subtarget.hasFRSQRTE() && Subtarget.hasFRE())) setOperationAction(ISD::FSQRT, MVT::f64, Expand); if (!Subtarget.hasFSQRT() && - !(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTES() && - Subtarget.hasFRES())) + !(Subtarget.hasFRSQRTES() && Subtarget.hasFRES())) setOperationAction(ISD::FSQRT, MVT::f32, Expand); if (Subtarget.hasFCPSGN()) { @@ -565,16 +562,15 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::BITCAST, MVT::i32, Legal); setOperationAction(ISD::BITCAST, MVT::i64, Legal); setOperationAction(ISD::BITCAST, MVT::f64, Legal); - if (TM.Options.UnsafeFPMath) { - setOperationAction(ISD::LRINT, MVT::f64, Legal); - setOperationAction(ISD::LRINT, MVT::f32, Legal); - setOperationAction(ISD::LLRINT, MVT::f64, Legal); - setOperationAction(ISD::LLRINT, MVT::f32, Legal); - setOperationAction(ISD::LROUND, MVT::f64, Legal); - setOperationAction(ISD::LROUND, MVT::f32, Legal); - setOperationAction(ISD::LLROUND, MVT::f64, Legal); - setOperationAction(ISD::LLROUND, MVT::f32, Legal); - } + + setOperationAction(ISD::STRICT_LRINT, MVT::f64, Custom); + setOperationAction(ISD::STRICT_LRINT, MVT::f32, Custom); + setOperationAction(ISD::STRICT_LLRINT, MVT::f64, Custom); + setOperationAction(ISD::STRICT_LLRINT, MVT::f32, Custom); + setOperationAction(ISD::STRICT_LROUND, MVT::f64, Custom); + setOperationAction(ISD::STRICT_LROUND, MVT::f32, Custom); + setOperationAction(ISD::STRICT_LLROUND, MVT::f64, Custom); + setOperationAction(ISD::STRICT_LLROUND, MVT::f32, Custom); } else { setOperationAction(ISD::BITCAST, MVT::f32, Expand); setOperationAction(ISD::BITCAST, MVT::i32, Expand); @@ -1034,11 +1030,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal); // The nearbyint variants are not allowed to raise the inexact exception - // so we can only code-gen them with unsafe math. - if (TM.Options.UnsafeFPMath) { - setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); - setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); - } + // so we can only code-gen them with fpexcept.ignore. + setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f64, Custom); + setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f32, Custom); setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal); setOperationAction(ISD::FCEIL, MVT::v2f64, Legal); @@ -8911,11 +8905,13 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, // be lost at this stage, but is below the single-precision rounding // position. // - // However, if -enable-unsafe-fp-math is in effect, accept double + // However, if afn is in effect, accept double // rounding to avoid the extra overhead. 
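  // (Concrete double-rounding case: the i64 value 2^53 + 2^29 + 1 converted
  // directly to f32 rounds up to 2^53 + 2^30, but converting through f64
  // first drops the trailing 1, and the resulting exact tie then rounds to
  // even, yielding 2^53 -- one f32 ulp lower.)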
- if (Op.getValueType() == MVT::f32 && - !Subtarget.hasFPCVT() && - !DAG.getTarget().Options.UnsafeFPMath) { + // FIXME: Currently INT_TO_FP can't support fast math flags because + // of nneg flag, thus Op->getFlags().hasApproximateFuncs() is always + // false. + if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT() && + !Op->getFlags().hasApproximateFuncs()) { // Twiddle input to make sure the low 11 bits are zero. (If this // is the case, we are guaranteed the value will fit into the 53 bit @@ -12759,6 +12755,14 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerADDSUBO_CARRY(Op, DAG); case ISD::UCMP: return LowerUCMP(Op, DAG); + case ISD::STRICT_LRINT: + case ISD::STRICT_LLRINT: + case ISD::STRICT_LROUND: + case ISD::STRICT_LLROUND: + case ISD::STRICT_FNEARBYINT: + if (Op->getFlags().hasNoFPExcept()) + return Op; + return SDValue(); } } @@ -13088,7 +13092,9 @@ PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB, BuildMI(BB, dl, TII->get(StoreMnemonic)) .addReg(TmpReg).addReg(ptrA).addReg(ptrB); BuildMI(BB, dl, TII->get(PPC::BCC)) - .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB); + .addImm(PPC::PRED_NE_MINUS) + .addReg(PPC::CR0) + .addMBB(loopMBB); BB->addSuccessor(loopMBB); BB->addSuccessor(exitMBB); @@ -13346,7 +13352,7 @@ MachineBasicBlock *PPCTargetLowering::EmitPartwordAtomicBinary( .addReg(ZeroReg) .addReg(PtrReg); BuildMI(BB, dl, TII->get(PPC::BCC)) - .addImm(PPC::PRED_NE) + .addImm(PPC::PRED_NE_MINUS) .addReg(PPC::CR0) .addMBB(loopMBB); BB->addSuccessor(loopMBB); @@ -14177,7 +14183,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, .addReg(dest) .addReg(oldval); BuildMI(BB, dl, TII->get(PPC::BCC)) - .addImm(PPC::PRED_NE) + .addImm(PPC::PRED_NE_MINUS) .addReg(CrReg) .addMBB(exitMBB); BB->addSuccessor(loop2MBB); @@ -14189,7 +14195,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, .addReg(ptrA) .addReg(ptrB); BuildMI(BB, dl, TII->get(PPC::BCC)) - .addImm(PPC::PRED_NE) + .addImm(PPC::PRED_NE_MINUS) .addReg(PPC::CR0) .addMBB(loop1MBB); BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB); @@ -14730,8 +14736,8 @@ SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand, SelectionDAG &DAG, } unsigned PPCTargetLowering::combineRepeatedFPDivisors() const { - // Note: This functionality is used only when unsafe-fp-math is enabled, and - // on cores with reciprocal estimates (which are used when unsafe-fp-math is + // Note: This functionality is used only when arcp is enabled, and + // on cores with reciprocal estimates (which are used when arcp is // enabled for division), this functionality is redundant with the default // combiner logic (once the division -> reciprocal/multiply transformation // has taken place). 
As a result, this matters more for older cores than for @@ -18707,11 +18713,12 @@ bool PPCTargetLowering::isProfitableToHoist(Instruction *I) const { const Function *F = I->getFunction(); const DataLayout &DL = F->getDataLayout(); Type *Ty = User->getOperand(0)->getType(); + bool AllowContract = I->getFastMathFlags().allowContract() && + User->getFastMathFlags().allowContract(); - return !( - isFMAFasterThanFMulAndFAdd(*F, Ty) && - isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) && - (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath)); + return !(isFMAFasterThanFMulAndFAdd(*F, Ty) && + isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) && + (AllowContract || Options.AllowFPOpFusion == FPOpFusion::Fast)); } case Instruction::Load: { // Don't break "store (load float*)" pattern, this pattern will be combined diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td index 979ba31..885bed6 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -2786,14 +2786,16 @@ def : Pat<(v2f64 (any_frint v2f64:$S)), (v2f64 (XVRDPIC $S))>; // Rounding without exceptions (nearbyint). Due to strange tblgen behaviour, // these need to be defined after the any_frint versions so ISEL will correctly // add the chain to the strict versions. -def : Pat<(f32 (fnearbyint f32:$S)), +// TODO: Match strict fp rounding intrinsics with instructions like xsrdpiz when +// rounding mode is propagated to CodeGen part. +def : Pat<(f32 (strict_fnearbyint f32:$S)), (f32 (COPY_TO_REGCLASS (XSRDPIC (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>; -def : Pat<(f64 (fnearbyint f64:$S)), +def : Pat<(f64 (strict_fnearbyint f64:$S)), (f64 (XSRDPIC $S))>; -def : Pat<(v2f64 (fnearbyint v2f64:$S)), +def : Pat<(v2f64 (strict_fnearbyint v2f64:$S)), (v2f64 (XVRDPIC $S))>; -def : Pat<(v4f32 (fnearbyint v4f32:$S)), +def : Pat<(v4f32 (strict_fnearbyint v4f32:$S)), (v4f32 (XVRSPIC $S))>; // Materialize a zero-vector of long long @@ -3578,25 +3580,25 @@ def : Pat<(f64 (bitconvert i64:$S)), (f64 (MTVSRD $S))>; // Rounding to integer. 
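// lrint/llrint map to FCTID alone, which converts using the current rounding
// mode (the lrint contract); lround/llround first apply XSRDPI (round to
// nearest, ties away from zero) to obtain lround's rounding rule, then
// convert the rounded value with FCTID.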
-def : Pat<(i64 (lrint f64:$S)), +def : Pat<(i64 (strict_lrint f64:$S)), (i64 (MFVSRD (FCTID $S)))>; -def : Pat<(i64 (lrint f32:$S)), +def : Pat<(i64 (strict_lrint f32:$S)), (i64 (MFVSRD (FCTID (COPY_TO_REGCLASS $S, F8RC))))>; -def : Pat<(i64 (llrint f64:$S)), +def : Pat<(i64 (strict_llrint f64:$S)), (i64 (MFVSRD (FCTID $S)))>; -def : Pat<(i64 (llrint f32:$S)), +def : Pat<(i64 (strict_llrint f32:$S)), (i64 (MFVSRD (FCTID (COPY_TO_REGCLASS $S, F8RC))))>; -def : Pat<(i64 (lround f64:$S)), +def : Pat<(i64 (strict_lround f64:$S)), (i64 (MFVSRD (FCTID (XSRDPI $S))))>; -def : Pat<(i64 (lround f32:$S)), +def : Pat<(i64 (strict_lround f32:$S)), (i64 (MFVSRD (FCTID (XSRDPI (COPY_TO_REGCLASS $S, VSFRC)))))>; -def : Pat<(i32 (lround f64:$S)), +def : Pat<(i32 (strict_lround f64:$S)), (i32 (MFVSRWZ (FCTIW (XSRDPI $S))))>; -def : Pat<(i32 (lround f32:$S)), +def : Pat<(i32 (strict_lround f32:$S)), (i32 (MFVSRWZ (FCTIW (XSRDPI (COPY_TO_REGCLASS $S, VSFRC)))))>; -def : Pat<(i64 (llround f64:$S)), +def : Pat<(i64 (strict_llround f64:$S)), (i64 (MFVSRD (FCTID (XSRDPI $S))))>; -def : Pat<(i64 (llround f32:$S)), +def : Pat<(i64 (strict_llround f32:$S)), (i64 (MFVSRD (FCTID (XSRDPI (COPY_TO_REGCLASS $S, VSFRC)))))>; // Alternate patterns for PPCmtvsrz where the output is v8i16 or v16i8 instead diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index e857b2d..edde7ac 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -2406,7 +2406,8 @@ ParseStatus RISCVAsmParser::parseVTypeI(OperandVector &Operands) { } bool RISCVAsmParser::generateVTypeError(SMLoc ErrorLoc) { - if (STI->hasFeature(RISCV::FeatureStdExtZvfbfa)) + if (STI->hasFeature(RISCV::FeatureStdExtZvfbfa) || + STI->hasFeature(RISCV::FeatureVendorXSfvfbfexp16e)) return Error( ErrorLoc, "operand must be " diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index b8ec0bb..4bea4c4 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -654,7 +654,10 @@ static constexpr FeatureBitset XqciFeatureGroup = { static constexpr FeatureBitset XSfVectorGroup = { RISCV::FeatureVendorXSfvcp, RISCV::FeatureVendorXSfvqmaccdod, RISCV::FeatureVendorXSfvqmaccqoq, RISCV::FeatureVendorXSfvfwmaccqqq, - RISCV::FeatureVendorXSfvfnrclipxfqf, RISCV::FeatureVendorXSfmmbase}; + RISCV::FeatureVendorXSfvfnrclipxfqf, RISCV::FeatureVendorXSfmmbase, + RISCV::FeatureVendorXSfvfexpa, RISCV::FeatureVendorXSfvfexpa64e, + RISCV::FeatureVendorXSfvfbfexp16e, RISCV::FeatureVendorXSfvfexp16e, + RISCV::FeatureVendorXSfvfexp32e}; static constexpr FeatureBitset XSfSystemGroup = { RISCV::FeatureVendorXSiFivecdiscarddlone, RISCV::FeatureVendorXSiFivecflushdlone, diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp index 50f5a5d..7b9c4b3 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp @@ -220,7 +220,8 @@ void RISCVInstPrinter::printVTypeI(const MCInst *MI, unsigned OpNo, if (RISCVVType::getVLMUL(Imm) == RISCVVType::VLMUL::LMUL_RESERVED || RISCVVType::getSEW(Imm) > 64 || (RISCVVType::isAltFmt(Imm) && - !STI.hasFeature(RISCV::FeatureStdExtZvfbfa)) || + !(STI.hasFeature(RISCV::FeatureStdExtZvfbfa) || + STI.hasFeature(RISCV::FeatureVendorXSfvfbfexp16e))) || (Imm >> 
9) != 0) { O << formatImm(Imm); return; diff --git a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp index 5dd4bf4..98b636e 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp @@ -109,12 +109,70 @@ bool RISCVExpandAtomicPseudo::expandMI(MachineBasicBlock &MBB, // expanded instructions for each pseudo is correct in the Size field of the // tablegen definition for the pseudo. switch (MBBI->getOpcode()) { + case RISCV::PseudoAtomicSwap32: + return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Xchg, false, 32, + NextMBBI); + case RISCV::PseudoAtomicSwap64: + return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Xchg, false, 64, + NextMBBI); + case RISCV::PseudoAtomicLoadAdd32: + return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Add, false, 32, + NextMBBI); + case RISCV::PseudoAtomicLoadAdd64: + return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Add, false, 64, + NextMBBI); + case RISCV::PseudoAtomicLoadSub32: + return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Sub, false, 32, + NextMBBI); + case RISCV::PseudoAtomicLoadSub64: + return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Sub, false, 64, + NextMBBI); + case RISCV::PseudoAtomicLoadAnd32: + return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::And, false, 32, + NextMBBI); + case RISCV::PseudoAtomicLoadAnd64: + return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::And, false, 64, + NextMBBI); + case RISCV::PseudoAtomicLoadOr32: + return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Or, false, 32, NextMBBI); + case RISCV::PseudoAtomicLoadOr64: + return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Or, false, 64, NextMBBI); + case RISCV::PseudoAtomicLoadXor32: + return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Xor, false, 32, + NextMBBI); + case RISCV::PseudoAtomicLoadXor64: + return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Xor, false, 64, + NextMBBI); case RISCV::PseudoAtomicLoadNand32: return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Nand, false, 32, NextMBBI); case RISCV::PseudoAtomicLoadNand64: return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Nand, false, 64, NextMBBI); + case RISCV::PseudoAtomicLoadMin32: + return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::Min, false, 32, + NextMBBI); + case RISCV::PseudoAtomicLoadMin64: + return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::Min, false, 64, + NextMBBI); + case RISCV::PseudoAtomicLoadMax32: + return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::Max, false, 32, + NextMBBI); + case RISCV::PseudoAtomicLoadMax64: + return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::Max, false, 64, + NextMBBI); + case RISCV::PseudoAtomicLoadUMin32: + return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::UMin, false, 32, + NextMBBI); + case RISCV::PseudoAtomicLoadUMin64: + return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::UMin, false, 64, + NextMBBI); + case RISCV::PseudoAtomicLoadUMax32: + return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::UMax, false, 32, + NextMBBI); + case RISCV::PseudoAtomicLoadUMax64: + return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::UMax, false, 64, + NextMBBI); case RISCV::PseudoMaskedAtomicSwap32: return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Xchg, true, 32, NextMBBI); @@ -277,6 +335,36 @@ static void doAtomicBinOpExpansion(const RISCVInstrInfo *TII, MachineInstr &MI, switch (BinOp) { default: llvm_unreachable("Unexpected AtomicRMW BinOp"); + case AtomicRMWInst::Xchg: + BuildMI(LoopMBB, DL, TII->get(RISCV::ADDI), ScratchReg) 
+ .addReg(IncrReg) + .addImm(0); + break; + case AtomicRMWInst::Add: + BuildMI(LoopMBB, DL, TII->get(RISCV::ADD), ScratchReg) + .addReg(DestReg) + .addReg(IncrReg); + break; + case AtomicRMWInst::Sub: + BuildMI(LoopMBB, DL, TII->get(RISCV::SUB), ScratchReg) + .addReg(DestReg) + .addReg(IncrReg); + break; + case AtomicRMWInst::And: + BuildMI(LoopMBB, DL, TII->get(RISCV::AND), ScratchReg) + .addReg(DestReg) + .addReg(IncrReg); + break; + case AtomicRMWInst::Or: + BuildMI(LoopMBB, DL, TII->get(RISCV::OR), ScratchReg) + .addReg(DestReg) + .addReg(IncrReg); + break; + case AtomicRMWInst::Xor: + BuildMI(LoopMBB, DL, TII->get(RISCV::XOR), ScratchReg) + .addReg(DestReg) + .addReg(IncrReg); + break; case AtomicRMWInst::Nand: BuildMI(LoopMBB, DL, TII->get(RISCV::AND), ScratchReg) .addReg(DestReg) @@ -433,38 +521,85 @@ static void insertSext(const RISCVInstrInfo *TII, DebugLoc DL, .addReg(ShamtReg); } -bool RISCVExpandAtomicPseudo::expandAtomicMinMaxOp( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - AtomicRMWInst::BinOp BinOp, bool IsMasked, int Width, - MachineBasicBlock::iterator &NextMBBI) { - assert(IsMasked == true && - "Should only need to expand masked atomic max/min"); - assert(Width == 32 && "Should never need to expand masked 64-bit operations"); +static void doAtomicMinMaxOpExpansion( + const RISCVInstrInfo *TII, MachineInstr &MI, DebugLoc DL, + MachineBasicBlock *ThisMBB, MachineBasicBlock *LoopHeadMBB, + MachineBasicBlock *LoopIfBodyMBB, MachineBasicBlock *LoopTailMBB, + MachineBasicBlock *DoneMBB, AtomicRMWInst::BinOp BinOp, int Width, + const RISCVSubtarget *STI) { + Register DestReg = MI.getOperand(0).getReg(); + Register ScratchReg = MI.getOperand(1).getReg(); + Register AddrReg = MI.getOperand(2).getReg(); + Register IncrReg = MI.getOperand(3).getReg(); + AtomicOrdering Ordering = + static_cast<AtomicOrdering>(MI.getOperand(4).getImm()); - MachineInstr &MI = *MBBI; - DebugLoc DL = MI.getDebugLoc(); - MachineFunction *MF = MBB.getParent(); - auto LoopHeadMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); - auto LoopIfBodyMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); - auto LoopTailMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); - auto DoneMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + // .loophead: + // lr.[w|d] dest, (addr) + // mv scratch, dest + // ifnochangeneeded scratch, incr, .looptail + BuildMI(LoopHeadMBB, DL, TII->get(getLRForRMW(Ordering, Width, STI)), DestReg) + .addReg(AddrReg); + BuildMI(LoopHeadMBB, DL, TII->get(RISCV::ADDI), ScratchReg) + .addReg(DestReg) + .addImm(0); + switch (BinOp) { + default: + llvm_unreachable("Unexpected AtomicRMW BinOp"); + case AtomicRMWInst::Max: { + BuildMI(LoopHeadMBB, DL, TII->get(RISCV::BGE)) + .addReg(ScratchReg) + .addReg(IncrReg) + .addMBB(LoopTailMBB); + break; + } + case AtomicRMWInst::Min: { + BuildMI(LoopHeadMBB, DL, TII->get(RISCV::BGE)) + .addReg(IncrReg) + .addReg(ScratchReg) + .addMBB(LoopTailMBB); + break; + } + case AtomicRMWInst::UMax: + BuildMI(LoopHeadMBB, DL, TII->get(RISCV::BGEU)) + .addReg(ScratchReg) + .addReg(IncrReg) + .addMBB(LoopTailMBB); + break; + case AtomicRMWInst::UMin: + BuildMI(LoopHeadMBB, DL, TII->get(RISCV::BGEU)) + .addReg(IncrReg) + .addReg(ScratchReg) + .addMBB(LoopTailMBB); + break; + } - // Insert new MBBs. 
- MF->insert(++MBB.getIterator(), LoopHeadMBB); - MF->insert(++LoopHeadMBB->getIterator(), LoopIfBodyMBB); - MF->insert(++LoopIfBodyMBB->getIterator(), LoopTailMBB); - MF->insert(++LoopTailMBB->getIterator(), DoneMBB); + // .loopifbody: + // mv scratch, incr + BuildMI(LoopIfBodyMBB, DL, TII->get(RISCV::ADDI), ScratchReg) + .addReg(IncrReg) + .addImm(0); - // Set up successors and transfer remaining instructions to DoneMBB. - LoopHeadMBB->addSuccessor(LoopIfBodyMBB); - LoopHeadMBB->addSuccessor(LoopTailMBB); - LoopIfBodyMBB->addSuccessor(LoopTailMBB); - LoopTailMBB->addSuccessor(LoopHeadMBB); - LoopTailMBB->addSuccessor(DoneMBB); - DoneMBB->splice(DoneMBB->end(), &MBB, MI, MBB.end()); - DoneMBB->transferSuccessors(&MBB); - MBB.addSuccessor(LoopHeadMBB); + // .looptail: + // sc.[w|d] scratch, scratch, (addr) + // bnez scratch, loop + BuildMI(LoopTailMBB, DL, TII->get(getSCForRMW(Ordering, Width, STI)), + ScratchReg) + .addReg(ScratchReg) + .addReg(AddrReg); + BuildMI(LoopTailMBB, DL, TII->get(RISCV::BNE)) + .addReg(ScratchReg) + .addReg(RISCV::X0) + .addMBB(LoopHeadMBB); +} +static void doMaskedAtomicMinMaxOpExpansion( + const RISCVInstrInfo *TII, MachineInstr &MI, DebugLoc DL, + MachineBasicBlock *ThisMBB, MachineBasicBlock *LoopHeadMBB, + MachineBasicBlock *LoopIfBodyMBB, MachineBasicBlock *LoopTailMBB, + MachineBasicBlock *DoneMBB, AtomicRMWInst::BinOp BinOp, int Width, + const RISCVSubtarget *STI) { + assert(Width == 32 && "Should never need to expand masked 64-bit operations"); Register DestReg = MI.getOperand(0).getReg(); Register Scratch1Reg = MI.getOperand(1).getReg(); Register Scratch2Reg = MI.getOperand(2).getReg(); @@ -541,6 +676,44 @@ bool RISCVExpandAtomicPseudo::expandAtomicMinMaxOp( .addReg(Scratch1Reg) .addReg(RISCV::X0) .addMBB(LoopHeadMBB); +} + +bool RISCVExpandAtomicPseudo::expandAtomicMinMaxOp( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + AtomicRMWInst::BinOp BinOp, bool IsMasked, int Width, + MachineBasicBlock::iterator &NextMBBI) { + + MachineInstr &MI = *MBBI; + DebugLoc DL = MI.getDebugLoc(); + MachineFunction *MF = MBB.getParent(); + auto LoopHeadMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + auto LoopIfBodyMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + auto LoopTailMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + auto DoneMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + + // Insert new MBBs. + MF->insert(++MBB.getIterator(), LoopHeadMBB); + MF->insert(++LoopHeadMBB->getIterator(), LoopIfBodyMBB); + MF->insert(++LoopIfBodyMBB->getIterator(), LoopTailMBB); + MF->insert(++LoopTailMBB->getIterator(), DoneMBB); + + // Set up successors and transfer remaining instructions to DoneMBB. 
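+  // (Resulting CFG: loophead -> {loopifbody, looptail}; loopifbody falls
+  // through to looptail; looptail branches back to loophead when the
+  // store-conditional fails, otherwise to done.)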
+ LoopHeadMBB->addSuccessor(LoopIfBodyMBB); + LoopHeadMBB->addSuccessor(LoopTailMBB); + LoopIfBodyMBB->addSuccessor(LoopTailMBB); + LoopTailMBB->addSuccessor(LoopHeadMBB); + LoopTailMBB->addSuccessor(DoneMBB); + DoneMBB->splice(DoneMBB->end(), &MBB, MI, MBB.end()); + DoneMBB->transferSuccessors(&MBB); + MBB.addSuccessor(LoopHeadMBB); + + if (!IsMasked) + doAtomicMinMaxOpExpansion(TII, MI, DL, &MBB, LoopHeadMBB, LoopIfBodyMBB, + LoopTailMBB, DoneMBB, BinOp, Width, STI); + else + doMaskedAtomicMinMaxOpExpansion(TII, MI, DL, &MBB, LoopHeadMBB, + LoopIfBodyMBB, LoopTailMBB, DoneMBB, BinOp, + Width, STI); NextMBBI = MBB.end(); MI.eraseFromParent(); diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 19992e6..9e6b7f0 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -218,6 +218,7 @@ def HasStdExtZaamo : Predicate<"Subtarget->hasStdExtZaamo()">, AssemblerPredicate<(any_of FeatureStdExtZaamo), "'Zaamo' (Atomic Memory Operations)">; +def NoStdExtZaamo : Predicate<"!Subtarget->hasStdExtZaamo()">; def FeatureStdExtZalrsc : RISCVExtension<1, 0, "Load-Reserved/Store-Conditional">; @@ -1334,6 +1335,44 @@ def HasVendorXSfvfnrclipxfqf AssemblerPredicate<(all_of FeatureVendorXSfvfnrclipxfqf), "'XSfvfnrclipxfqf' (SiFive FP32-to-int8 Ranged Clip Instructions)">; +// Note: XSfvfbfexp16e depends on either Zvfbfmin _or_ Zvfbfa, which cannot be expressed here in +// TableGen. Instead, we check that in RISCVISAInfo. +def FeatureVendorXSfvfbfexp16e + : RISCVExtension<0, 5, + "SiFive Vector Floating-Point Exponential Function Instruction, BFloat16">; +def HasVendorXSfvfbfexp16e : Predicate<"Subtarget->hasVendorXSfvfbfexp16e()">; + +def FeatureVendorXSfvfexp16e + : RISCVExtension<0, 5, + "SiFive Vector Floating-Point Exponential Function Instruction, Half Precision", + [FeatureStdExtZvfh]>; +def HasVendorXSfvfexp16e : Predicate<"Subtarget->hasVendorXSfvfexp16e()">; + +def FeatureVendorXSfvfexp32e + : RISCVExtension<0, 5, + "SiFive Vector Floating-Point Exponential Function Instruction, Single Precision", + [FeatureStdExtZve32f]>; +def HasVendorXSfvfexp32e : Predicate<"Subtarget->hasVendorXSfvfexp32e()">; + +def HasVendorXSfvfexpAnyFloat : Predicate<"Subtarget->hasVendorXSfvfexp16e() || Subtarget->hasVendorXSfvfexp32e()">; +def HasVendorXSfvfexpAny : Predicate<"Subtarget->hasVendorXSfvfbfexp16e() || Subtarget->hasVendorXSfvfexp16e() || Subtarget->hasVendorXSfvfexp32e()">, + AssemblerPredicate<(any_of FeatureVendorXSfvfbfexp16e, FeatureVendorXSfvfexp16e, FeatureVendorXSfvfexp32e), + "'Xsfvfbfexp16e', 'Xsfvfexp16e', or 'Xsfvfexp32e' (SiFive Vector Floating-Point Exponential Function Instruction)">; + +def FeatureVendorXSfvfexpa + : RISCVExtension<0, 2, + "SiFive Vector Floating-Point Exponential Approximation Instruction", + [FeatureStdExtZve32f]>; +def HasVendorXSfvfexpa : Predicate<"Subtarget->hasVendorXSfvfexpa()">, + AssemblerPredicate<(all_of FeatureVendorXSfvfexpa), + "'Xsfvfexpa' (SiFive Vector Floating-Point Exponential Approximation Instruction)">; + +def FeatureVendorXSfvfexpa64e + : RISCVExtension<0, 2, + "SiFive Vector Floating-Point Exponential Approximation Instruction with Double-Precision", + [FeatureVendorXSfvfexpa, FeatureStdExtZve64d]>; +def HasVendorXSfvfexpa64e : Predicate<"Subtarget->hasVendorXSfvfexpa64e()">; + def FeatureVendorXSiFivecdiscarddlone : RISCVExtension<1, 0, "SiFive sf.cdiscard.d.l1 Instruction", []>; @@ -1864,7 +1903,7 @@ def FeatureForcedAtomics : SubtargetFeature< "forced-atomics", 
"HasForcedAtomics", "true", "Assume that lock-free native-width atomics are available">; def HasAtomicLdSt - : Predicate<"Subtarget->hasStdExtA() || Subtarget->hasForcedAtomics()">; + : Predicate<"Subtarget->hasStdExtZalrsc() || Subtarget->hasForcedAtomics()">; def FeatureTaggedGlobals : SubtargetFeature<"tagged-globals", "AllowTaggedGlobals", diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 0a53ba9..26fe9ed 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -688,7 +688,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, else if (Subtarget.hasStdExtZicbop()) setOperationAction(ISD::PREFETCH, MVT::Other, Legal); - if (Subtarget.hasStdExtA()) { + if (Subtarget.hasStdExtZalrsc()) { setMaxAtomicSizeInBitsSupported(Subtarget.getXLen()); if (Subtarget.hasStdExtZabha() && Subtarget.hasStdExtZacas()) setMinCmpXchgSizeInBits(8); @@ -1558,7 +1558,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, } } - if (Subtarget.hasStdExtA()) + if (Subtarget.hasStdExtZaamo()) setOperationAction(ISD::ATOMIC_LOAD_SUB, XLenVT, Expand); if (Subtarget.hasForcedAtomics()) { @@ -21875,7 +21875,7 @@ unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode( // result is then sign extended to XLEN. With +A, the minimum width is // 32 for both 64 and 32. assert(getMinCmpXchgSizeInBits() == 32); - assert(Subtarget.hasStdExtA()); + assert(Subtarget.hasStdExtZalrsc()); return Op.getValueSizeInBits() - 31; } break; @@ -24044,18 +24044,7 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, } } - std::pair<Register, const TargetRegisterClass *> Res = - TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); - - // If we picked one of the Zfinx register classes, remap it to the GPR class. - // FIXME: When Zfinx is supported in CodeGen this will need to take the - // Subtarget into account. - if (Res.second == &RISCV::GPRF16RegClass || - Res.second == &RISCV::GPRF32RegClass || - Res.second == &RISCV::GPRPairRegClass) - return std::make_pair(Res.first, &RISCV::GPRRegClass); - - return Res; + return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); } InlineAsm::ConstraintCode @@ -24482,6 +24471,25 @@ ISD::NodeType RISCVTargetLowering::getExtendForAtomicCmpSwapArg() const { return Subtarget.hasStdExtZacas() ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND; } +ISD::NodeType RISCVTargetLowering::getExtendForAtomicRMWArg(unsigned Op) const { + // Zaamo will use amo<op>.w which does not require extension. + if (Subtarget.hasStdExtZaamo() || Subtarget.hasForcedAtomics()) + return ISD::ANY_EXTEND; + + // Zalrsc pseudo expansions with comparison require sign-extension. 
+ assert(Subtarget.hasStdExtZalrsc()); + switch (Op) { + case ISD::ATOMIC_LOAD_MIN: + case ISD::ATOMIC_LOAD_MAX: + case ISD::ATOMIC_LOAD_UMIN: + case ISD::ATOMIC_LOAD_UMAX: + return ISD::SIGN_EXTEND; + default: + break; + } + return ISD::ANY_EXTEND; +} + Register RISCVTargetLowering::getExceptionPointerRegister( const Constant *PersonalityFn) const { return RISCV::X10; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 3f81ed7..9e3e2a9 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -245,6 +245,7 @@ public: } ISD::NodeType getExtendForAtomicCmpSwapArg() const override; + ISD::NodeType getExtendForAtomicRMWArg(unsigned Op) const override; bool shouldTransformSignedTruncationCheck(EVT XVT, unsigned KeptBits) const override; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 12f776b..912b82d 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -1689,42 +1689,44 @@ bool RISCVInstrInfo::isBranchOffsetInRange(unsigned BranchOp, // instruction opcode. Otherwise, return RISCV::INSTRUCTION_LIST_END. // TODO: Support more operations. unsigned getPredicatedOpcode(unsigned Opcode) { + // clang-format off switch (Opcode) { - case RISCV::ADD: return RISCV::PseudoCCADD; break; - case RISCV::SUB: return RISCV::PseudoCCSUB; break; - case RISCV::SLL: return RISCV::PseudoCCSLL; break; - case RISCV::SRL: return RISCV::PseudoCCSRL; break; - case RISCV::SRA: return RISCV::PseudoCCSRA; break; - case RISCV::AND: return RISCV::PseudoCCAND; break; - case RISCV::OR: return RISCV::PseudoCCOR; break; - case RISCV::XOR: return RISCV::PseudoCCXOR; break; - - case RISCV::ADDI: return RISCV::PseudoCCADDI; break; - case RISCV::SLLI: return RISCV::PseudoCCSLLI; break; - case RISCV::SRLI: return RISCV::PseudoCCSRLI; break; - case RISCV::SRAI: return RISCV::PseudoCCSRAI; break; - case RISCV::ANDI: return RISCV::PseudoCCANDI; break; - case RISCV::ORI: return RISCV::PseudoCCORI; break; - case RISCV::XORI: return RISCV::PseudoCCXORI; break; - - case RISCV::ADDW: return RISCV::PseudoCCADDW; break; - case RISCV::SUBW: return RISCV::PseudoCCSUBW; break; - case RISCV::SLLW: return RISCV::PseudoCCSLLW; break; - case RISCV::SRLW: return RISCV::PseudoCCSRLW; break; - case RISCV::SRAW: return RISCV::PseudoCCSRAW; break; - - case RISCV::ADDIW: return RISCV::PseudoCCADDIW; break; - case RISCV::SLLIW: return RISCV::PseudoCCSLLIW; break; - case RISCV::SRLIW: return RISCV::PseudoCCSRLIW; break; - case RISCV::SRAIW: return RISCV::PseudoCCSRAIW; break; - - case RISCV::ANDN: return RISCV::PseudoCCANDN; break; - case RISCV::ORN: return RISCV::PseudoCCORN; break; - case RISCV::XNOR: return RISCV::PseudoCCXNOR; break; - - case RISCV::NDS_BFOS: return RISCV::PseudoCCNDS_BFOS; break; - case RISCV::NDS_BFOZ: return RISCV::PseudoCCNDS_BFOZ; break; + case RISCV::ADD: return RISCV::PseudoCCADD; + case RISCV::SUB: return RISCV::PseudoCCSUB; + case RISCV::SLL: return RISCV::PseudoCCSLL; + case RISCV::SRL: return RISCV::PseudoCCSRL; + case RISCV::SRA: return RISCV::PseudoCCSRA; + case RISCV::AND: return RISCV::PseudoCCAND; + case RISCV::OR: return RISCV::PseudoCCOR; + case RISCV::XOR: return RISCV::PseudoCCXOR; + + case RISCV::ADDI: return RISCV::PseudoCCADDI; + case RISCV::SLLI: return RISCV::PseudoCCSLLI; + case RISCV::SRLI: return RISCV::PseudoCCSRLI; + case RISCV::SRAI: return RISCV::PseudoCCSRAI; + case RISCV::ANDI: return 
RISCV::PseudoCCANDI; + case RISCV::ORI: return RISCV::PseudoCCORI; + case RISCV::XORI: return RISCV::PseudoCCXORI; + + case RISCV::ADDW: return RISCV::PseudoCCADDW; + case RISCV::SUBW: return RISCV::PseudoCCSUBW; + case RISCV::SLLW: return RISCV::PseudoCCSLLW; + case RISCV::SRLW: return RISCV::PseudoCCSRLW; + case RISCV::SRAW: return RISCV::PseudoCCSRAW; + + case RISCV::ADDIW: return RISCV::PseudoCCADDIW; + case RISCV::SLLIW: return RISCV::PseudoCCSLLIW; + case RISCV::SRLIW: return RISCV::PseudoCCSRLIW; + case RISCV::SRAIW: return RISCV::PseudoCCSRAIW; + + case RISCV::ANDN: return RISCV::PseudoCCANDN; + case RISCV::ORN: return RISCV::PseudoCCORN; + case RISCV::XNOR: return RISCV::PseudoCCXNOR; + + case RISCV::NDS_BFOS: return RISCV::PseudoCCNDS_BFOS; + case RISCV::NDS_BFOZ: return RISCV::PseudoCCNDS_BFOZ; } + // clang-format on return RISCV::INSTRUCTION_LIST_END; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td index 571d72f..5c81a09 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td @@ -158,9 +158,9 @@ class seq_cst_store<PatFrag base> } } // IsAtomic = 1 -// Atomic load/store are available under both +a and +force-atomics. -// Fences will be inserted for atomic load/stores according to the logic in -// RISCVTargetLowering::{emitLeadingFence,emitTrailingFence}. +// Atomic load/store are available under +zalrsc (thus also +a) and +// +force-atomics. Fences will be inserted for atomic load/stores according to +// the logic in RISCVTargetLowering::{emitLeadingFence,emitTrailingFence}. // The normal loads/stores are relaxed (unordered) loads/stores that don't have // any ordering. This is necessary because AtomicExpandPass has added fences to // atomic load/stores and changed them to unordered ones. 
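As context for the Zalrsc-only lowering below and for the getExtendForAtomicRMWArg hook above: the min/max pseudos expand to an LR/SC loop whose compare is an XLEN-wide branch, so 32-bit operands must be extended in a way that preserves their ordering. Sign-extension preserves both the signed and the unsigned 32-bit order, which is why the hook returns ISD::SIGN_EXTEND for all four min/max operations. A standalone sanity check of that claim (plain C++, illustrative only, not code from the tree):

#include <cassert>
#include <cstdint>

// How a 32-bit atomic operand sits in an RV64 register: lr.w (and
// sext.w) sign-extend the loaded value to 64 bits.
int64_t sext32(uint32_t V) { return (int64_t)(int32_t)V; }

int main() {
  const uint32_t Tests[] = {0u, 1u, 0x7fffffffu, 0x80000000u, 0xffffffffu};
  for (uint32_t A : Tests)
    for (uint32_t B : Tests) {
      // A 64-bit signed compare (BLT) of sign-extended values matches the
      // 32-bit signed order needed by min/max.
      assert(((int32_t)A < (int32_t)B) == (sext32(A) < sext32(B)));
      // A 64-bit unsigned compare (BLTU) also matches the 32-bit unsigned
      // order needed by umin/umax: values with the sign bit set map above
      // 2^63 and therefore stay above all values without it.
      assert((A < B) == ((uint64_t)sext32(A) < (uint64_t)sext32(B)));
    }
  return 0;
}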
@@ -308,7 +308,65 @@ class PseudoMaskedAMOMinMaxPat<Intrinsic intrin, Pseudo AMOInst> (AMOInst GPR:$addr, GPR:$incr, GPR:$mask, GPR:$shiftamt, timm:$ordering)>; -let Predicates = [HasStdExtA] in { +let Predicates = [HasStdExtZalrsc, NoStdExtZaamo] in { + +let Size = 16 in { +def PseudoAtomicSwap32 : PseudoAMO; +def PseudoAtomicLoadAdd32 : PseudoAMO; +def PseudoAtomicLoadSub32 : PseudoAMO; +def PseudoAtomicLoadAnd32 : PseudoAMO; +def PseudoAtomicLoadOr32 : PseudoAMO; +def PseudoAtomicLoadXor32 : PseudoAMO; +} // Size = 16 +let Size = 24 in { +def PseudoAtomicLoadMax32 : PseudoAMO; +def PseudoAtomicLoadMin32 : PseudoAMO; +def PseudoAtomicLoadUMax32 : PseudoAMO; +def PseudoAtomicLoadUMin32 : PseudoAMO; +} // Size = 24 + +defm : PseudoAMOPat<"atomic_swap_i32", PseudoAtomicSwap32>; +defm : PseudoAMOPat<"atomic_load_add_i32", PseudoAtomicLoadAdd32>; +defm : PseudoAMOPat<"atomic_load_sub_i32", PseudoAtomicLoadSub32>; +defm : PseudoAMOPat<"atomic_load_and_i32", PseudoAtomicLoadAnd32>; +defm : PseudoAMOPat<"atomic_load_or_i32", PseudoAtomicLoadOr32>; +defm : PseudoAMOPat<"atomic_load_xor_i32", PseudoAtomicLoadXor32>; +defm : PseudoAMOPat<"atomic_load_max_i32", PseudoAtomicLoadMax32>; +defm : PseudoAMOPat<"atomic_load_min_i32", PseudoAtomicLoadMin32>; +defm : PseudoAMOPat<"atomic_load_umax_i32", PseudoAtomicLoadUMax32>; +defm : PseudoAMOPat<"atomic_load_umin_i32", PseudoAtomicLoadUMin32>; +} // Predicates = [HasStdExtZalrsc, NoStdExtZaamo] + +let Predicates = [HasStdExtZalrsc, NoStdExtZaamo, IsRV64] in { + +let Size = 16 in { +def PseudoAtomicSwap64 : PseudoAMO; +def PseudoAtomicLoadAdd64 : PseudoAMO; +def PseudoAtomicLoadSub64 : PseudoAMO; +def PseudoAtomicLoadAnd64 : PseudoAMO; +def PseudoAtomicLoadOr64 : PseudoAMO; +def PseudoAtomicLoadXor64 : PseudoAMO; +} // Size = 16 +let Size = 24 in { +def PseudoAtomicLoadMax64 : PseudoAMO; +def PseudoAtomicLoadMin64 : PseudoAMO; +def PseudoAtomicLoadUMax64 : PseudoAMO; +def PseudoAtomicLoadUMin64 : PseudoAMO; +} // Size = 24 + +defm : PseudoAMOPat<"atomic_swap_i64", PseudoAtomicSwap64, i64>; +defm : PseudoAMOPat<"atomic_load_add_i64", PseudoAtomicLoadAdd64, i64>; +defm : PseudoAMOPat<"atomic_load_sub_i64", PseudoAtomicLoadSub64, i64>; +defm : PseudoAMOPat<"atomic_load_and_i64", PseudoAtomicLoadAnd64, i64>; +defm : PseudoAMOPat<"atomic_load_or_i64", PseudoAtomicLoadOr64, i64>; +defm : PseudoAMOPat<"atomic_load_xor_i64", PseudoAtomicLoadXor64, i64>; +defm : PseudoAMOPat<"atomic_load_max_i64", PseudoAtomicLoadMax64, i64>; +defm : PseudoAMOPat<"atomic_load_min_i64", PseudoAtomicLoadMin64, i64>; +defm : PseudoAMOPat<"atomic_load_umax_i64", PseudoAtomicLoadUMax64, i64>; +defm : PseudoAMOPat<"atomic_load_umin_i64", PseudoAtomicLoadUMin64, i64>; +} // Predicates = [HasStdExtZalrsc, NoStdExtZaamo, IsRV64] + +let Predicates = [HasStdExtZalrsc] in { let Size = 20 in def PseudoAtomicLoadNand32 : PseudoAMO; @@ -347,14 +405,14 @@ def : PseudoMaskedAMOPat<int_riscv_masked_atomicrmw_umax, PseudoMaskedAtomicLoadUMax32>; def : PseudoMaskedAMOPat<int_riscv_masked_atomicrmw_umin, PseudoMaskedAtomicLoadUMin32>; -} // Predicates = [HasStdExtA] +} // Predicates = [HasStdExtZalrsc] -let Predicates = [HasStdExtA, IsRV64] in { +let Predicates = [HasStdExtZalrsc, IsRV64] in { let Size = 20 in def PseudoAtomicLoadNand64 : PseudoAMO; defm : PseudoAMOPat<"atomic_load_nand_i64", PseudoAtomicLoadNand64, i64>; -} // Predicates = [HasStdExtA, IsRV64] +} // Predicates = [HasStdExtZalrsc, IsRV64] /// Compare and exchange @@ -385,17 +443,17 @@ multiclass PseudoCmpXchgPat<string Op, Pseudo 
CmpXchgInst, (CmpXchgInst GPR:$addr, GPR:$cmp, GPR:$new, 7)>; } -let Predicates = [HasStdExtA, NoStdExtZacas] in { +let Predicates = [HasStdExtZalrsc, NoStdExtZacas] in { def PseudoCmpXchg32 : PseudoCmpXchg; defm : PseudoCmpXchgPat<"atomic_cmp_swap_i32", PseudoCmpXchg32>; } -let Predicates = [HasStdExtA, NoStdExtZacas, IsRV64] in { +let Predicates = [HasStdExtZalrsc, NoStdExtZacas, IsRV64] in { def PseudoCmpXchg64 : PseudoCmpXchg; defm : PseudoCmpXchgPat<"atomic_cmp_swap_i64", PseudoCmpXchg64, i64>; } -let Predicates = [HasStdExtA] in { +let Predicates = [HasStdExtZalrsc] in { def PseudoMaskedCmpXchg32 : Pseudo<(outs GPR:$res, GPR:$scratch), (ins GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, @@ -412,4 +470,4 @@ def : Pat<(XLenVT (int_riscv_masked_cmpxchg (XLenVT GPR:$mask), (XLenVT timm:$ordering))), (PseudoMaskedCmpXchg32 GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, timm:$ordering)>; -} // Predicates = [HasStdExtA] +} // Predicates = [HasStdExtZalrsc] diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td index 6a4119a..4104abd 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td @@ -217,6 +217,14 @@ let Predicates = [HasVendorXSfvcp], mayLoad = 0, mayStore = 0, defm FVW : CustomSiFiveVCIX<"fvw", VCIX_XVW, VR, VR, FPR32>, Sched<[]>; } +let Predicates = [HasVendorXSfvfexpAny], DecoderNamespace = "XSfvector" in { + def SF_VFEXP_V : VALUVs2<0b010011, 0b00111, OPFVV, "sf.vfexp.v">; +} + +let Predicates = [HasVendorXSfvfexpa], DecoderNamespace = "XSfvector" in { + def SF_VFEXPA_V : VALUVs2<0b010011, 0b00110, OPFVV, "sf.vfexpa.v">; +} + let Predicates = [HasVendorXSfvqmaccdod], DecoderNamespace = "XSfvector", DestEEW = EEWSEWx4, RVVConstraint=VS2Constraint in { def SF_VQMACCU_2x8x2 : CustomSiFiveVMACC<0b101100, OPMVV, "sf.vqmaccu.2x8x2">; diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 5591d9f..021353a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -355,9 +355,9 @@ private: SPIRVType *widenTypeToVec4(const SPIRVType *Type, MachineInstr &I) const; bool extractSubvector(Register &ResVReg, const SPIRVType *ResType, Register &ReadReg, MachineInstr &InsertionPoint) const; - bool generateImageRead(Register &ResVReg, const SPIRVType *ResType, - Register ImageReg, Register IdxReg, DebugLoc Loc, - MachineInstr &Pos) const; + bool generateImageReadOrFetch(Register &ResVReg, const SPIRVType *ResType, + Register ImageReg, Register IdxReg, + DebugLoc Loc, MachineInstr &Pos) const; bool BuildCOPY(Register DestReg, Register SrcReg, MachineInstr &I) const; bool loadVec3BuiltinInputID(SPIRV::BuiltIn::BuiltIn BuiltInValue, Register ResVReg, const SPIRVType *ResType, @@ -1321,8 +1321,8 @@ bool SPIRVInstructionSelector::selectLoad(Register ResVReg, } Register IdxReg = IntPtrDef->getOperand(3).getReg(); - return generateImageRead(ResVReg, ResType, NewHandleReg, IdxReg, - I.getDebugLoc(), I); + return generateImageReadOrFetch(ResVReg, ResType, NewHandleReg, IdxReg, + I.getDebugLoc(), I); } } @@ -3639,27 +3639,33 @@ bool SPIRVInstructionSelector::selectReadImageIntrinsic( DebugLoc Loc = I.getDebugLoc(); MachineInstr &Pos = I; - return generateImageRead(ResVReg, ResType, NewImageReg, IdxReg, Loc, Pos); + return generateImageReadOrFetch(ResVReg, ResType, NewImageReg, IdxReg, Loc, + Pos); } -bool SPIRVInstructionSelector::generateImageRead(Register &ResVReg, - const 
SPIRVType *ResType, - Register ImageReg, - Register IdxReg, DebugLoc Loc, - MachineInstr &Pos) const { +bool SPIRVInstructionSelector::generateImageReadOrFetch( + Register &ResVReg, const SPIRVType *ResType, Register ImageReg, + Register IdxReg, DebugLoc Loc, MachineInstr &Pos) const { SPIRVType *ImageType = GR.getSPIRVTypeForVReg(ImageReg); assert(ImageType && ImageType->getOpcode() == SPIRV::OpTypeImage && "ImageReg is not an image type."); + bool IsSignedInteger = sampledTypeIsSignedInteger(GR.getTypeForSPIRVType(ImageType)); + // Check if the "sampled" operand of the image type is 1. + // https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#OpImageFetch + auto SampledOp = ImageType->getOperand(6); + bool IsFetch = (SampledOp.getImm() == 1); uint64_t ResultSize = GR.getScalarOrVectorComponentCount(ResType); if (ResultSize == 4) { - auto BMI = BuildMI(*Pos.getParent(), Pos, Loc, TII.get(SPIRV::OpImageRead)) - .addDef(ResVReg) - .addUse(GR.getSPIRVTypeID(ResType)) - .addUse(ImageReg) - .addUse(IdxReg); + auto BMI = + BuildMI(*Pos.getParent(), Pos, Loc, + TII.get(IsFetch ? SPIRV::OpImageFetch : SPIRV::OpImageRead)) + .addDef(ResVReg) + .addUse(GR.getSPIRVTypeID(ResType)) + .addUse(ImageReg) + .addUse(IdxReg); if (IsSignedInteger) BMI.addImm(0x1000); // SignExtend @@ -3668,11 +3674,13 @@ bool SPIRVInstructionSelector::generateImageRead(Register &ResVReg, SPIRVType *ReadType = widenTypeToVec4(ResType, Pos); Register ReadReg = MRI->createVirtualRegister(GR.getRegClass(ReadType)); - auto BMI = BuildMI(*Pos.getParent(), Pos, Loc, TII.get(SPIRV::OpImageRead)) - .addDef(ReadReg) - .addUse(GR.getSPIRVTypeID(ReadType)) - .addUse(ImageReg) - .addUse(IdxReg); + auto BMI = + BuildMI(*Pos.getParent(), Pos, Loc, + TII.get(IsFetch ? SPIRV::OpImageFetch : SPIRV::OpImageRead)) + .addDef(ReadReg) + .addUse(GR.getSPIRVTypeID(ReadType)) + .addUse(ImageReg) + .addUse(IdxReg); if (IsSignedInteger) BMI.addImm(0x1000); // SignExtend bool Succeed = BMI.constrainAllUses(TII, TRI, RBI); diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp index cf85691..9bda8a4 100644 --- a/llvm/lib/Target/TargetMachine.cpp +++ b/llvm/lib/Target/TargetMachine.cpp @@ -158,7 +158,6 @@ void TargetMachine::resetTargetOptions(const Function &F) const { Options.X = F.getFnAttribute(Y).getValueAsBool(); \ } while (0) - RESET_OPTION(UnsafeFPMath, "unsafe-fp-math"); RESET_OPTION(NoInfsFPMath, "no-infs-fp-math"); RESET_OPTION(NoNaNsFPMath, "no-nans-fp-math"); RESET_OPTION(NoSignedZerosFPMath, "no-signed-zeros-fp-math"); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index f973949..7ec463b 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -183,6 +183,11 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( for (auto T : {MVT::i32, MVT::i64}) setOperationAction(Op, T, Custom); + if (Subtarget->hasRelaxedSIMD()) { + setOperationAction( + {ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXNUM, ISD::FMAXIMUMNUM}, + {MVT::v4f32, MVT::v2f64}, Legal); + } // SIMD-specific configuration if (Subtarget->hasSIMD128()) { diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index 7840620..f0ac26b 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -1742,6 +1742,23 @@ defm SIMD_RELAXED_FMIN : defm SIMD_RELAXED_FMAX : 
RelaxedBinary<F64x2, int_wasm_relaxed_max, "relaxed_max", 0x110>;
+let Predicates = [HasRelaxedSIMD] in {
+  foreach vec = [F32x4, F64x2] in {
+    defvar relaxed_min = !cast<NI>("SIMD_RELAXED_FMIN_"#vec);
+    defvar relaxed_max = !cast<NI>("SIMD_RELAXED_FMAX_"#vec);
+
+    // Lower fminnum/fminimumnum and fmaxnum/fmaximumnum to the relaxed versions
+    def : Pat<(vec.vt (fminnum (vec.vt V128:$lhs), (vec.vt V128:$rhs))),
+              (relaxed_min V128:$lhs, V128:$rhs)>;
+    def : Pat<(vec.vt (fminimumnum (vec.vt V128:$lhs), (vec.vt V128:$rhs))),
+              (relaxed_min V128:$lhs, V128:$rhs)>;
+    def : Pat<(vec.vt (fmaxnum (vec.vt V128:$lhs), (vec.vt V128:$rhs))),
+              (relaxed_max V128:$lhs, V128:$rhs)>;
+    def : Pat<(vec.vt (fmaximumnum (vec.vt V128:$lhs), (vec.vt V128:$rhs))),
+              (relaxed_max V128:$lhs, V128:$rhs)>;
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // Relaxed rounding q15 multiplication
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b5f8ee5..d49f25a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -20558,7 +20558,7 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL,
 //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
 
 // NOTE: By using fsub of a positive constant instead of fadd of a negative
-// constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
+// constant, we avoid reassociation in MachineCombiner when reassoc is
 // enabled. See PR24512.
 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
 // TODO: Are there any fast-math-flags to propagate here?
@@ -29516,11 +29516,8 @@ static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
   if (IgnoreNaN || DAG.isKnownNeverNaN(IsNum ? NewY : NewX))
     return MinMax;
 
-  if (DAG.isKnownNeverNaN(NewX))
-    NewX = NewY;
-
-  SDValue IsNaN =
-      DAG.getSetCC(DL, SetCCType, NewX, NewX, IsNum ? ISD::SETO : ISD::SETUO);
+  SDValue NaNSrc = IsNum ? MinMax : NewX;
+  SDValue IsNaN = DAG.getSetCC(DL, SetCCType, NaNSrc, NaNSrc, ISD::SETUO);
 
   return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax);
 }
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 83bd6ac..1b748b7 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -5519,7 +5519,7 @@ defm VMIN : avx512_binop_s_sae<0x5D, "vmin", X86any_fmin, X86fmins, X86fminSAEs,
                                SchedWriteFCmpSizes, 0>;
 defm VMAX : avx512_binop_s_sae<0x5F, "vmax", X86any_fmax, X86fmaxs, X86fmaxSAEs,
                                SchedWriteFCmpSizes, 0>;
-// MIN/MAX nodes are commutable under "unsafe-fp-math". In this case we use
+// MIN/MAX nodes are commutable under (nnan + nsz). In this case we use
 // X86fminc and X86fmaxc instead of X86fmin and X86fmax
 multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr,
                                     X86VectorVTInfo _, SDNode OpNode,
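The "(nnan + nsz)" caveat on the commutable min/max nodes is easy to demonstrate: an SSE-style min that returns its second operand on equal or unordered inputs changes its answer under operand swap as soon as signed zeros or NaNs are involved. A standalone illustration (plain C++, not code from the tree):

#include <cassert>
#include <cmath>

// SSE MINPS/MINSS semantics: if the operands are equal (note +0 == -0)
// or unordered (a NaN is present), the second operand is returned.
double sseMin(double A, double B) { return A < B ? A : B; }

int main() {
  // Signed zeros: +0 and -0 compare equal, so the second operand wins;
  // swapping the operands flips the sign of the result.
  assert(std::signbit(sseMin(-0.0, 0.0)) != std::signbit(sseMin(0.0, -0.0)));

  // NaN: the ordered compare is false, so the second operand is returned;
  // swapping the operands moves the NaN in and out of the result.
  double QNaN = std::nan("");
  assert(std::isnan(sseMin(1.0, QNaN)) != std::isnan(sseMin(QNaN, 1.0)));

  // With nnan and nsz both cases are excluded, min/max become commutable,
  // and the commutable X86fminc/X86fmaxc nodes are safe to use.
  return 0;
}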
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td
index cc30054..ac4d31d 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver4.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td
@@ -15,7 +15,7 @@
 //===----------------------------------------------------------------------===//
 
 def Znver4Model : SchedMachineModel {
-  // AMD SOG Zen4, 2.9.6 Dispatch
+  // AMD SOG Zen4, 2.9.8 Dispatch
   // The processor may dispatch up to 6 macro ops per cycle
   // into the execution engine.
   let IssueWidth = 6;
@@ -46,8 +46,9 @@ def Znver4Model : SchedMachineModel {
   int VecLoadLatency = 7;
   // Latency of a simple store operation.
   int StoreLatency = 1;
-  // FIXME:
-  let HighLatency = 25; // FIXME: any better choice?
+  // Mean and median value for all instructions with latencies >6
+  // Source: Zen4 Instruction Latencies spreadsheet (included with SOG)
+  let HighLatency = 13;
   // AMD SOG Zen4, 2.8 Optimizing Branching
   // The branch misprediction penalty is in the range from 11 to 18 cycles,
   // <...>. The common case penalty is 13 cycles.
@@ -612,6 +613,7 @@ def Zn4WriteLEA : SchedWriteVariant<[
 
 def : InstRW<[Zn4WriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;
 
+// values from uops.info
 def Zn4SlowLEA16r : SchedWriteRes<[Zn4ALU0123]> {
   let Latency = 2; // FIXME: not from llvm-exegesis
   let ReleaseAtCycles = [4];
@@ -659,15 +661,15 @@ def : InstRW<[Zn4WriteCMPXCHG8rm_LCMPXCHG8], (instrs CMPXCHG8rm, LCMPXCHG8)>;
 
 def Zn4WriteCMPXCHG8B : SchedWriteRes<[Zn4ALU0123]> {
   let Latency = 3; // FIXME: not from llvm-exegesis
-  let ReleaseAtCycles = [24];
-  let NumMicroOps = 19;
+  let ReleaseAtCycles = [20];
+  let NumMicroOps = 15;
 }
 def : InstRW<[Zn4WriteCMPXCHG8B], (instrs CMPXCHG8B)>;
 
 def Zn4WriteCMPXCHG16B_LCMPXCHG16B : SchedWriteRes<[Zn4ALU0123]> {
-  let Latency = 4; // FIXME: not from llvm-exegesis
-  let ReleaseAtCycles = [59];
-  let NumMicroOps = 28;
+  let Latency = 2; // FIXME: not from llvm-exegesis
+  let ReleaseAtCycles = [40];
+  let NumMicroOps = 26;
 }
 def : InstRW<[Zn4WriteCMPXCHG16B_LCMPXCHG16B], (instrs CMPXCHG16B, LCMPXCHG16B)>;
@@ -681,7 +683,7 @@ def : InstRW<[Zn4WriteWriteXCHGUnrenameable], (instrs XCHG8rr, XCHG16rr, XCHG16a
 def Zn4WriteXCHG8rm_XCHG16rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
   let Latency = !add(Znver4Model.LoadLatency, 3); // FIXME: not from llvm-exegesis
   let ReleaseAtCycles = [1, 1, 2];
-  let NumMicroOps = 5;
+  let NumMicroOps = 2;
 }
 def : InstRW<[Zn4WriteXCHG8rm_XCHG16rm], (instrs XCHG8rm, XCHG16rm)>;
@@ -693,19 +695,17 @@ def Zn4WriteXCHG32rm_XCHG64rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]>
 def : InstRW<[Zn4WriteXCHG32rm_XCHG64rm], (instrs XCHG32rm, XCHG64rm)>;
 
 // Integer division.
-// FIXME: uops for 8-bit division measures as 2. for others it's a guess.
-// FIXME: latency for 8-bit division measures as 10. for others it's a guess.
-defm : Zn4WriteResIntPair<WriteDiv8, [Zn4Divider], 10, [10], 2>;
-defm : Zn4WriteResIntPair<WriteDiv16, [Zn4Divider], 11, [11], 2>;
-defm : Zn4WriteResIntPair<WriteDiv32, [Zn4Divider], 13, [13], 2>;
-defm : Zn4WriteResIntPair<WriteDiv64, [Zn4Divider], 17, [17], 2>;
-defm : Zn4WriteResIntPair<WriteIDiv8, [Zn4Divider], 10, [10], 2>;
-defm : Zn4WriteResIntPair<WriteIDiv16, [Zn4Divider], 11, [11], 2>;
-defm : Zn4WriteResIntPair<WriteIDiv32, [Zn4Divider], 13, [13], 2>;
-defm : Zn4WriteResIntPair<WriteIDiv64, [Zn4Divider], 17, [17], 2>;
-
-defm : Zn4WriteResIntPair<WriteBSF, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan forward.
-defm : Zn4WriteResIntPair<WriteBSR, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan reverse.
+defm : Zn4WriteResIntPair<WriteDiv8, [Zn4Divider], 9, [9], 2>;
+defm : Zn4WriteResIntPair<WriteDiv16, [Zn4Divider], 10, [10], 2>;
+defm : Zn4WriteResIntPair<WriteDiv32, [Zn4Divider], 12, [12], 2>;
+defm : Zn4WriteResIntPair<WriteDiv64, [Zn4Divider], 18, [18], 2>;
+defm : Zn4WriteResIntPair<WriteIDiv8, [Zn4Divider], 9, [9], 2>;
+defm : Zn4WriteResIntPair<WriteIDiv16, [Zn4Divider], 10, [10], 2>;
+defm : Zn4WriteResIntPair<WriteIDiv32, [Zn4Divider], 12, [12], 2>;
+defm : Zn4WriteResIntPair<WriteIDiv64, [Zn4Divider], 18, [18], 2>;
+
+defm : Zn4WriteResIntPair<WriteBSF, [Zn4ALU1], 1, [1], 1, /*LoadUOps=*/1>; // Bit scan forward.
+defm : Zn4WriteResIntPair<WriteBSR, [Zn4ALU1], 1, [1], 1, /*LoadUOps=*/1>; // Bit scan reverse.
 
 defm : Zn4WriteResIntPair<WritePOPCNT, [Zn4ALU0123], 1, [1], 1>; // Bit population count.
@@ -725,12 +725,12 @@ def Zn4WriteLZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
 }
 def : InstRW<[Zn4WriteLZCNT16rr], (instrs LZCNT16rr)>;
 
-defm : Zn4WriteResIntPair<WriteTZCNT, [Zn4ALU12], 2, [1], 2>; // Trailing zero count.
+defm : Zn4WriteResIntPair<WriteTZCNT, [Zn4ALU12], 1, [1], 1>; // Trailing zero count.
 
 def Zn4WriteTZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
-  let Latency = 2;
-  let ReleaseAtCycles = [4];
-  let NumMicroOps = 2;
+  let Latency = 1;
+  let ReleaseAtCycles = [1];
+  let NumMicroOps = 1;
 }
 def : InstRW<[Zn4WriteTZCNT16rr], (instrs TZCNT16rr)>;
@@ -1109,15 +1109,31 @@ def Zn4WriteVecOpMaskKRMov : SchedWriteRes<[Zn4FPOpMask4]> {
 }
 def : InstRW<[Zn4WriteVecOpMaskKRMov], (instrs KMOVBkr, KMOVDkr, KMOVQkr, KMOVWkr)>;
 
-def Zn4WriteVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
-  // TODO: All align instructions are expected to be of 4 cycle latency
-  let Latency = 4;
+// 128-bit VALIGN
+def Zn4WriteXMMVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
+  let Latency = 2;
   let ReleaseAtCycles = [1];
   let NumMicroOps = 1;
 }
-def : InstRW<[Zn4WriteVecALU2Slow], (instrs VALIGNDZrri, VALIGNDZ128rri, VALIGNDZ256rri,
-                                            VALIGNQZrri, VALIGNQZ128rri, VALIGNQZ256rri)
-            >;
+
+// 256-bit VALIGN
+def Zn4WriteYMMVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
+  let Latency = 3;
+  let ReleaseAtCycles = [1];
+  let NumMicroOps = 1;
+}
+
+// 512-bit VALIGN
+def Zn4WriteZMMVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
+  let Latency = 4;
+  let ReleaseAtCycles = [2];
+  let NumMicroOps = 1;
+}
+
+def : InstRW<[Zn4WriteXMMVecALU2Slow], (instrs VALIGNDZ128rri, VALIGNQZ128rri)>;
+def : InstRW<[Zn4WriteYMMVecALU2Slow], (instrs VALIGNDZ256rri, VALIGNQZ256rri)>;
+def : InstRW<[Zn4WriteZMMVecALU2Slow], (instrs VALIGNDZrri, VALIGNQZrri)>;
+
 defm : Zn4WriteResYMMPair<WriteVecALUY, [Zn4FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (YMM).
 
 def Zn4WriteVecALUYSlow : SchedWriteRes<[Zn4FPVAdd01]> {
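Overrides like the VALIGN ones above are normally validated against llvm-exegesis or uops.info measurements; the latency a scheduling model actually assigns can also be queried from the MC layer directly. A rough sketch of such a checker follows (assumptions: an LLVM build with the X86 backend, and MC API names/signatures as of recent LLVM, which do drift between releases, so treat this as illustrative):

#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include <cstdio>
#include <memory>
#include <string>

using namespace llvm;

int main() {
  InitializeAllTargetInfos();
  InitializeAllTargetMCs();

  std::string Err;
  std::string TT = "x86_64-unknown-linux-gnu";
  const Target *T = TargetRegistry::lookupTarget(TT, Err);
  if (!T)
    return 1;

  std::unique_ptr<MCInstrInfo> MCII(T->createMCInstrInfo());
  std::unique_ptr<MCSubtargetInfo> STI(
      T->createMCSubtargetInfo(TT, "znver4", ""));
  const MCSchedModel &SM = STI->getSchedModel();

  // Look the opcode up by name and print the latency the znver4 model
  // assigns to it (non-variant scheduling classes only).
  for (unsigned Op = 0, E = MCII->getNumOpcodes(); Op != E; ++Op) {
    if (MCII->getName(Op) != "VALIGNDZ128rri")
      continue;
    const MCSchedClassDesc *SC =
        SM.getSchedClassDesc(MCII->get(Op).getSchedClass());
    if (SC->isValid() && !SC->isVariant())
      std::printf("VALIGNDZ128rri latency: %d\n",
                  MCSchedModel::computeInstrLatency(*STI, *SC));
  }
  return 0;
}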
@@ -1326,9 +1342,9 @@ def : InstRW<[Zn4WriteSHA256RNDS2rr], (instrs SHA256RNDS2rr)>;
 
 // Strings instructions.
 // Packed Compare Implicit Length Strings, Return Mask
-defm : Zn4WriteResXMMPair<WritePCmpIStrM, [Zn4FPVAdd0123], 6, [8], 3, /*LoadUOps=*/1>;
+defm : Zn4WriteResXMMPair<WritePCmpIStrM, [Zn4FPVAdd0123], 7, [8], 3, /*LoadUOps=*/1>;
 // Packed Compare Explicit Length Strings, Return Mask
-defm : Zn4WriteResXMMPair<WritePCmpEStrM, [Zn4FPVAdd0123], 6, [12], 7, /*LoadUOps=*/5>;
+defm : Zn4WriteResXMMPair<WritePCmpEStrM, [Zn4FPVAdd0123], 7, [12], 7, /*LoadUOps=*/5>;
 // Packed Compare Implicit Length Strings, Return Index
 defm : Zn4WriteResXMMPair<WritePCmpIStrI, [Zn4FPVAdd0123], 2, [8], 4>;
 // Packed Compare Explicit Length Strings, Return Index
@@ -1340,7 +1356,7 @@ defm : Zn4WriteResXMMPair<WriteAESIMC, [Zn4FPAES01], 4, [1], 1>; // InvMixColumn
 defm : Zn4WriteResXMMPair<WriteAESKeyGen, [Zn4FPAES01], 4, [1], 1>; // Key Generation.
 
 // Carry-less multiplication instructions.
-defm : Zn4WriteResXMMPair<WriteCLMul, [Zn4FPCLM01], 4, [4], 4>;
+defm : Zn4WriteResXMMPair<WriteCLMul, [Zn4FPCLM01], 4, [3], 4>;
 
 // EMMS/FEMMS
 defm : Zn4WriteResInt<WriteEMMS, [Zn4ALU0123], 2, [1], 1>; // FIXME: latency not from llvm-exegesis
@@ -1386,44 +1402,44 @@ def Zn4WriteVPERM2F128rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
 def : InstRW<[Zn4WriteVPERM2F128rm], (instrs VPERM2F128rmi)>;
 
 def Zn4WriteVPERMPSYrr : SchedWriteRes<[Zn4FPVShuf]> {
-  let Latency = 7;
+  let Latency = 4;
   let ReleaseAtCycles = [1];
-  let NumMicroOps = 2;
+  let NumMicroOps = 1;
 }
 def : InstRW<[Zn4WriteVPERMPSYrr], (instrs VPERMPSYrr)>;
 
 def Zn4WriteVPERMPSYrm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
   let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMPSYrr.Latency);
-  let ReleaseAtCycles = [1, 1, 2];
-  let NumMicroOps = !add(Zn4WriteVPERMPSYrr.NumMicroOps, 1);
+  let ReleaseAtCycles = [1, 1, 1];
+  let NumMicroOps = 1;
 }
 def : InstRW<[Zn4WriteVPERMPSYrm], (instrs VPERMPSYrm)>;
 
 def Zn4WriteVPERMYri : SchedWriteRes<[Zn4FPVShuf]> {
-  let Latency = 6;
+  let Latency = 4;
   let ReleaseAtCycles = [1];
-  let NumMicroOps = 2;
+  let NumMicroOps = 1;
 }
 def : InstRW<[Zn4WriteVPERMYri], (instrs VPERMPDYri, VPERMQYri)>;
 
 def Zn4WriteVPERMPDYmi : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
   let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMYri.Latency);
-  let ReleaseAtCycles = [1, 1, 2];
-  let NumMicroOps = !add(Zn4WriteVPERMYri.NumMicroOps, 1);
+  let ReleaseAtCycles = [1, 1, 1];
+  let NumMicroOps = 1;
 }
 def : InstRW<[Zn4WriteVPERMPDYmi], (instrs VPERMPDYmi)>;
 
 def Zn4WriteVPERMDYrr : SchedWriteRes<[Zn4FPVShuf]> {
-  let Latency = 5;
+  let Latency = 4;
   let ReleaseAtCycles = [1];
-  let NumMicroOps = 2;
+  let NumMicroOps = 1;
 }
 def : InstRW<[Zn4WriteVPERMDYrr], (instrs VPERMDYrr)>;
 
 def Zn4WriteVPERMYm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
   let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMDYrr.Latency);
-  let ReleaseAtCycles = [1, 1, 2];
-  let NumMicroOps = !add(Zn4WriteVPERMDYrr.NumMicroOps, 0);
+  let ReleaseAtCycles = [1, 1, 1];
+  let NumMicroOps = 1;
 }
 def : InstRW<[Zn4WriteVPERMYm], (instrs VPERMQYmi, VPERMDYrm)>;
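One detail of the SPIRVInstructionSelector change earlier in this diff is worth spelling out: generateImageReadOrFetch reads getOperand(6) because, in the selector's MachineInstr form of OpTypeImage, operand 0 is the result and operands 1-7 are Sampled Type, Dim, Depth, Arrayed, MS, Sampled and Format, which puts "Sampled" at index 6. Per the SPIR-V spec, Sampled == 1 means the image is used with a sampler (so reads must use OpImageFetch) and Sampled == 2 means a storage image (read with OpImageRead). A minimal standalone model of that decision (illustrative only, not the in-tree code):

#include <array>
#include <cassert>
#include <cstdint>

enum class ReadOpcode { OpImageFetch, OpImageRead };

// Operand layout mirroring the selector's OpTypeImage MachineInstr:
// [0] result, [1] sampled type, [2] Dim, [3] Depth, [4] Arrayed,
// [5] MS, [6] Sampled, [7] Format.
using TypeImageOperands = std::array<uint32_t, 8>;

ReadOpcode selectReadOpcode(const TypeImageOperands &Ops) {
  // Sampled == 1: image used with a sampler, read via OpImageFetch.
  // Sampled == 2 (or 0, unknown at compile time): read via OpImageRead.
  return Ops[6] == 1 ? ReadOpcode::OpImageFetch : ReadOpcode::OpImageRead;
}

int main() {
  TypeImageOperands Sampled{9, 2, 1, 0, 0, 0, /*Sampled=*/1, 0};
  TypeImageOperands Storage{9, 2, 1, 0, 0, 0, /*Sampled=*/2, 0};
  assert(selectReadOpcode(Sampled) == ReadOpcode::OpImageFetch);
  assert(selectReadOpcode(Storage) == ReadOpcode::OpImageRead);
  return 0;
}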