From 8520061281b0475bf4767107ddc94cf13335db48 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 18 Jun 2024 18:34:34 +0200 Subject: AMDGPU: Support local atomicrmw fmin/fmax for float/double (#95590) This has always been supported. Somehow, we ended up with 2 copies of clang builtins for this case, and the newer one erroneously requires gfx8-insts. --- llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 7 +- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 4 +- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 2 - llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 2 - .../Target/AMDGPU/AMDGPUInstructionSelector.cpp | 4 +- llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 2 + llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 9 +- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 6 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 51 +- llvm/lib/Target/AMDGPU/SIInstrInfo.td | 19 +- llvm/lib/Target/AMDGPU/SIInstructions.td | 5 - llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 4 +- .../UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir | 9 +- llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll | 1667 +++----------------- llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll | 1667 +++----------------- .../AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll | 32 +- .../AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll | 32 +- 17 files changed, 479 insertions(+), 3043 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index d81c188..537d3a4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -271,11 +271,8 @@ def : GINodeEquiv; // FIXME: Check MMO is atomic def : GINodeEquiv; def : GINodeEquiv; -def : GINodeEquiv; -def : GINodeEquiv; -def : GINodeEquiv; -def : GINodeEquiv; - +def : GINodeEquiv; +def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index b50c0cc..d60c62a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -502,9 +502,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { // isa almost works but is slightly too permissive for some DS // intrinsics. - if (Opc == ISD::LOAD || Opc == ISD::STORE || isa(N) || - Opc == AMDGPUISD::ATOMIC_LOAD_FMIN || - Opc == AMDGPUISD::ATOMIC_LOAD_FMAX) { + if (Opc == ISD::LOAD || Opc == ISD::STORE || isa(N)) { N = glueCopyToM0LDSInit(N); SelectCode(N); return; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 519e623..522b3a3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -5524,8 +5524,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16) NODE_NAME_CASE(DS_ORDERED_COUNT) NODE_NAME_CASE(ATOMIC_CMP_SWAP) - NODE_NAME_CASE(ATOMIC_LOAD_FMIN) - NODE_NAME_CASE(ATOMIC_LOAD_FMAX) NODE_NAME_CASE(BUFFER_LOAD) NODE_NAME_CASE(BUFFER_LOAD_UBYTE) NODE_NAME_CASE(BUFFER_LOAD_USHORT) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 206bb46..37572af 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -575,8 +575,6 @@ enum NodeType : unsigned { TBUFFER_LOAD_FORMAT_D16, DS_ORDERED_COUNT, ATOMIC_CMP_SWAP, - ATOMIC_LOAD_FMIN, - ATOMIC_LOAD_FMAX, BUFFER_LOAD, BUFFER_LOAD_UBYTE, BUFFER_LOAD_USHORT, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index ae3f2b8..03e2d62 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -3620,8 +3620,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { case TargetOpcode::G_ATOMICRMW_UINC_WRAP: case TargetOpcode::G_ATOMICRMW_UDEC_WRAP: case TargetOpcode::G_ATOMICRMW_FADD: - case AMDGPU::G_AMDGPU_ATOMIC_FMIN: - case AMDGPU::G_AMDGPU_ATOMIC_FMAX: + case TargetOpcode::G_ATOMICRMW_FMIN: + case TargetOpcode::G_ATOMICRMW_FMAX: return selectG_LOAD_STORE_ATOMICRMW(I); case TargetOpcode::G_SELECT: return selectG_SELECT(I); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index ebc6402..21f541d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -685,6 +685,8 @@ defm atomic_load_umax : binary_atomic_op_all_as; defm atomic_load_umin : binary_atomic_op_all_as; defm atomic_load_xor : binary_atomic_op_all_as; defm atomic_load_fadd : binary_atomic_op_fp_all_as; +defm atomic_load_fmin : binary_atomic_op_fp_all_as; +defm atomic_load_fmax : binary_atomic_op_fp_all_as; defm atomic_load_uinc_wrap : binary_atomic_op_all_as; defm atomic_load_udec_wrap : binary_atomic_op_all_as; defm AMDGPUatomic_cmp_swap : binary_atomic_op_all_as; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 0c7b196..4ff945e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -283,7 +283,9 @@ static const LLT S1 = LLT::scalar(1); static const LLT S8 = LLT::scalar(8); static const LLT S16 = LLT::scalar(16); static const LLT S32 = LLT::scalar(32); +static const LLT F32 = LLT::float32(); static const LLT S64 = LLT::scalar(64); +static const LLT F64 = LLT::float64(); static const LLT S96 = LLT::scalar(96); static const LLT S128 = LLT::scalar(128); static const LLT S160 = LLT::scalar(160); @@ -1648,6 +1650,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, if (ST.hasFlatAtomicFaddF32Inst()) Atomic.legalFor({{S32, FlatPtr}}); + getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX}) + .legalFor({{F32, LocalPtr}, {F64, LocalPtr}}); + if (ST.hasGFX90AInsts()) { // These are legal with some caveats, and should have undergone expansion in // the IR in most situations @@ -5401,9 +5406,9 @@ static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { case Intrinsic::amdgcn_ds_fadd: return AMDGPU::G_ATOMICRMW_FADD; case Intrinsic::amdgcn_ds_fmin: - return AMDGPU::G_AMDGPU_ATOMIC_FMIN; + return AMDGPU::G_ATOMICRMW_FMIN; case Intrinsic::amdgcn_ds_fmax: - return AMDGPU::G_AMDGPU_ATOMIC_FMAX; + return AMDGPU::G_ATOMICRMW_FMAX; default: llvm_unreachable("not a DS FP intrinsic"); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 313d53a..0510a1d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -5219,11 +5219,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_ATOMICRMW_UMAX: case AMDGPU::G_ATOMICRMW_UMIN: case AMDGPU::G_ATOMICRMW_FADD: + case AMDGPU::G_ATOMICRMW_FMIN: + case AMDGPU::G_ATOMICRMW_FMAX: case AMDGPU::G_ATOMICRMW_UINC_WRAP: case AMDGPU::G_ATOMICRMW_UDEC_WRAP: - case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: - case AMDGPU::G_AMDGPU_ATOMIC_FMIN: - case AMDGPU::G_AMDGPU_ATOMIC_FMAX: { + case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: { OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg()); OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index c436e03..c607437 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -945,6 +945,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, ISD::ATOMIC_LOAD_UMIN, ISD::ATOMIC_LOAD_UMAX, ISD::ATOMIC_LOAD_FADD, + ISD::ATOMIC_LOAD_FMIN, + ISD::ATOMIC_LOAD_FMAX, ISD::ATOMIC_LOAD_UINC_WRAP, ISD::ATOMIC_LOAD_UDEC_WRAP, ISD::INTRINSIC_VOID, @@ -8707,25 +8709,11 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_ds_fmin: case Intrinsic::amdgcn_ds_fmax: { MemSDNode *M = cast(Op); - unsigned Opc; - switch (IntrID) { - case Intrinsic::amdgcn_ds_fmin: - Opc = AMDGPUISD::ATOMIC_LOAD_FMIN; - break; - case Intrinsic::amdgcn_ds_fmax: - Opc = AMDGPUISD::ATOMIC_LOAD_FMAX; - break; - default: - llvm_unreachable("Unknown intrinsic!"); - } - SDValue Ops[] = { - M->getOperand(0), // Chain - M->getOperand(2), // Ptr - M->getOperand(3) // Value - }; - - return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops, - M->getMemoryVT(), M->getMemOperand()); + unsigned Opc = IntrID == Intrinsic::amdgcn_ds_fmin ? ISD::ATOMIC_LOAD_FMIN + : ISD::ATOMIC_LOAD_FMAX; + return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(), M->getOperand(0), + M->getOperand(2), M->getOperand(3), + M->getMemOperand()); } case Intrinsic::amdgcn_raw_buffer_load: case Intrinsic::amdgcn_raw_ptr_buffer_load: @@ -9130,22 +9118,21 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_global_atomic_fmin_num: case Intrinsic::amdgcn_flat_atomic_fmin: case Intrinsic::amdgcn_flat_atomic_fmin_num: { - Opcode = AMDGPUISD::ATOMIC_LOAD_FMIN; + Opcode = ISD::ATOMIC_LOAD_FMIN; break; } case Intrinsic::amdgcn_global_atomic_fmax: case Intrinsic::amdgcn_global_atomic_fmax_num: case Intrinsic::amdgcn_flat_atomic_fmax: case Intrinsic::amdgcn_flat_atomic_fmax_num: { - Opcode = AMDGPUISD::ATOMIC_LOAD_FMAX; + Opcode = ISD::ATOMIC_LOAD_FMAX; break; } default: llvm_unreachable("unhandled atomic opcode"); } - return DAG.getMemIntrinsicNode(Opcode, SDLoc(Op), - M->getVTList(), Ops, M->getMemoryVT(), - M->getMemOperand()); + return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(), + Ops, M->getMemOperand()); } case Intrinsic::amdgcn_s_get_barrier_state: { SDValue Chain = Op->getOperand(0); @@ -15816,8 +15803,6 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N, case ISD::INTRINSIC_W_CHAIN: return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1)); case AMDGPUISD::ATOMIC_CMP_SWAP: - case AMDGPUISD::ATOMIC_LOAD_FMIN: - case AMDGPUISD::ATOMIC_LOAD_FMAX: case AMDGPUISD::BUFFER_ATOMIC_SWAP: case AMDGPUISD::BUFFER_ATOMIC_ADD: case AMDGPUISD::BUFFER_ATOMIC_SUB: @@ -16077,17 +16062,21 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { return AtomicExpansionKind::CmpXChg; } case AtomicRMWInst::FMin: - case AtomicRMWInst::FMax: + case AtomicRMWInst::FMax: { + Type *Ty = RMW->getType(); + + // LDS float and double fmin/fmax were always supported. + if (AS == AMDGPUAS::LOCAL_ADDRESS && (Ty->isFloatTy() || Ty->isDoubleTy())) + return AtomicExpansionKind::None; + + return AtomicExpansionKind::CmpXChg; + } case AtomicRMWInst::Min: case AtomicRMWInst::Max: case AtomicRMWInst::UMin: case AtomicRMWInst::UMax: { if (AMDGPU::isFlatGlobalAddrSpace(AS) || AS == AMDGPUAS::BUFFER_FAT_POINTER) { - if (RMW->getType()->isFloatTy() && - unsafeFPAtomicsDisabled(RMW->getFunction())) - return AtomicExpansionKind::CmpXChg; - // Always expand system scope min/max atomics. if (HasSystemScope) return AtomicExpansionKind::CmpXChg; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 9b9ff4a..80c6235 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -72,14 +72,6 @@ def SDTAtomic2_f32 : SDTypeProfile<1, 2, [ SDTCisSameAs<0,2>, SDTCisFP<0>, SDTCisPtrTy<1> ]>; -def SIatomic_fmin : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMIN", SDTAtomic2_f32, - [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] ->; - -def SIatomic_fmax : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMAX", SDTAtomic2_f32, - [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] ->; - // load_d16_{lo|hi} ptr, tied_input def SIload_d16 : SDTypeProfile<1, 2, [ SDTCisPtrTy<1>, @@ -314,13 +306,6 @@ class isIntType { } //===----------------------------------------------------------------------===// -// PatFrags for global memory operations -//===----------------------------------------------------------------------===// - -defm atomic_load_fmin : binary_atomic_op_fp_all_as; -defm atomic_load_fmax : binary_atomic_op_fp_all_as; - -//===----------------------------------------------------------------------===// // SDNodes PatFrags for loads/stores with a glue input. // This is for SDNodes and PatFrag for local loads and stores to // enable s_mov_b32 m0, -1 to be glued to the memory instructions. @@ -742,8 +727,8 @@ defm atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">; defm atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">; defm atomic_swap : SIAtomicM0Glue2 <"SWAP">; defm atomic_load_fadd : SIAtomicM0Glue2 <"LOAD_FADD", 0, SDTAtomic2_f32, 0>; -defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 1, SDTAtomic2_f32, 0>; -defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 1, SDTAtomic2_f32, 0>; +defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 0, SDTAtomic2_f32, 0>; +defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 0, SDTAtomic2_f32, 0>; def as_i1timm : SDNodeXFormgetTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i1); diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index e32bb8f..531b23d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -3863,11 +3863,6 @@ def G_AMDGPU_ATOMIC_CMPXCHG : AMDGPUGenericInstruction { let mayStore = 1; } -let Namespace = "AMDGPU" in { -def G_AMDGPU_ATOMIC_FMIN : G_ATOMICRMW_OP; -def G_AMDGPU_ATOMIC_FMAX : G_ATOMICRMW_OP; -} - class BufferAtomicGenericInstruction : AMDGPUGenericInstruction { let OutOperandList = (outs type0:$dst); let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset, diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index ef635fd..e6a439e 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -590,9 +590,7 @@ bool isCvt_F32_Fp8_Bf8_e64(unsigned Opc) { } bool isGenericAtomic(unsigned Opc) { - return Opc == AMDGPU::G_AMDGPU_ATOMIC_FMIN || - Opc == AMDGPU::G_AMDGPU_ATOMIC_FMAX || - Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP || + return Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP || Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD || Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB || Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN || diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir index 6581c5c..f2ba7f8 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir @@ -98,12 +98,13 @@ body: | %2:_(s32) = IMPLICIT_DEF %3:_(<4 x s32>) = COPY $sgpr4_sgpr5_sgpr6_sgpr7 %4:_(s32) = G_CONSTANT i32 0 + %ptr_lds:_(p3) = G_IMPLICIT_DEF - ; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_AMDGPU_ATOMIC_FMIN - %5:_(s32) = G_AMDGPU_ATOMIC_FMIN %0, %3 + ; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_FMIN + %5:_(s32) = G_ATOMICRMW_FMIN %ptr_lds, %4 - ; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_AMDGPU_ATOMIC_FMAX - %6:_(s32) = G_AMDGPU_ATOMIC_FMAX %0, %3 + ; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_FMAX + %6:_(s32) = G_ATOMICRMW_FMAX %ptr_lds, %4 ; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_AMDGPU_BUFFER_ATOMIC_SWAP %7:_(s32) = G_AMDGPU_BUFFER_ATOMIC_SWAP %0, %3, %4, %4, %4, 0, 0, 0 :: (volatile dereferenceable load store (s32), align 1, addrspace 8) diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll index 6c227c7..9a6ac12 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll @@ -21,212 +21,82 @@ define float @local_atomic_fmax_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v1, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB0_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v1, v2, v2 -; GFX12-NEXT: v_max_num_f32_e32 v1, 4.0, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 +; GFX12-NEXT: ds_max_num_rtn_f32 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB0_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_f32: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_read_b32 v1, v0 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX940-NEXT: v_max_f32_e32 v1, 4.0, v1 -; GFX940-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX940-NEXT: ds_max_rtn_f32 v0, v0, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB0_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmax_ret_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v1, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX11-NEXT: v_max_f32_e32 v1, 4.0, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 +; GFX11-NEXT: ds_max_rtn_f32 v0, v0, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB0_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmax_ret_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v1, v0 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v1 -; GFX10-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX10-NEXT: v_max_f32_e32 v1, 4.0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX10-NEXT: ds_max_rtn_f32 v0, v0, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB0_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmax_ret_f32: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v1, v0 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX90A-NEXT: ds_max_rtn_f32 v0, v0, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX90A-NEXT: v_max_f32_e32 v1, 4.0, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmax_ret_f32: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v1, v0 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX908-NEXT: ds_max_rtn_f32 v0, v0, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v1 -; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v1, 4.0, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB0_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v1 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmax_ret_f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v1, v0 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: ds_max_rtn_f32 v0, v0, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v1 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX8-NEXT: v_max_f32_e32 v1, 4.0, v1 -; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB0_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmax_ret_f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v1, v0 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB0_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: ds_max_rtn_f32 v0, v0, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, v1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX7-NEXT: v_max_f32_e32 v1, 4.0, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB0_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmax_ret_f32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v1, v0 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB0_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX6-NEXT: v_max_f32_e32 v1, 4.0, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX6-NEXT: ds_max_rtn_f32 v0, v0, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB0_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr addrspace(3) %ptr, float 4.0 seq_cst ret float %result @@ -240,212 +110,83 @@ define float @local_atomic_fmax_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v1, v0 offset:65532 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB1_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v1, v2, v2 -; GFX12-NEXT: v_max_num_f32_e32 v1, 4.0, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65532 +; GFX12-NEXT: ds_max_num_rtn_f32 v0, v0, v1 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB1_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_f32__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_read_b32 v1, v0 offset:65532 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX940-NEXT: ds_max_rtn_f32 v0, v0, v1 offset:65532 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX940-NEXT: v_max_f32_e32 v1, 4.0, v1 -; GFX940-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB1_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmax_ret_f32__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v1, v0 offset:65532 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB1_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX11-NEXT: v_max_f32_e32 v1, 4.0, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65532 +; GFX11-NEXT: ds_max_rtn_f32 v0, v0, v1 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB1_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmax_ret_f32__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v1, v0 offset:65532 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v1 -; GFX10-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX10-NEXT: v_max_f32_e32 v1, 4.0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532 +; GFX10-NEXT: ds_max_rtn_f32 v0, v0, v1 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB1_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmax_ret_f32__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65532 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX90A-NEXT: v_max_f32_e32 v1, 4.0, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532 +; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX90A-NEXT: ds_max_rtn_f32 v0, v0, v1 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmax_ret_f32__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v1, v0 offset:65532 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v1 -; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v1, 4.0, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532 +; GFX908-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX908-NEXT: ds_max_rtn_f32 v0, v0, v1 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB1_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v1 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmax_ret_f32__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v1, v0 offset:65532 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v1 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX8-NEXT: v_max_f32_e32 v1, 4.0, v1 -; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532 +; GFX8-NEXT: ds_max_rtn_f32 v0, v0, v1 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB1_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmax_ret_f32__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v1, v0 offset:65532 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB1_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, v1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX7-NEXT: v_max_f32_e32 v1, 4.0, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532 +; GFX7-NEXT: ds_max_rtn_f32 v0, v0, v1 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB1_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmax_ret_f32__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffc, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v0, v1 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB1_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v2 -; GFX6-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX6-NEXT: ds_cmpst_rtn_b32 v0, v1, v2, v0 +; GFX6-NEXT: ds_max_rtn_f32 v0, v0, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB1_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fmax ptr addrspace(3) %gep, float 4.0 seq_cst @@ -460,203 +201,82 @@ define void @local_atomic_fmax_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v1, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB2_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v2, v1, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v2, 4.0, v2 +; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 +; GFX12-NEXT: ds_max_num_f32 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB2_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_f32: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_read_b32 v1, v0 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB2_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX940-NEXT: v_max_f32_e32 v2, 4.0, v2 -; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX940-NEXT: ds_max_f32 v0, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB2_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmax_noret_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v1, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB2_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v2, 4.0, v2 +; GFX11-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 +; GFX11-NEXT: ds_max_f32 v0, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB2_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmax_noret_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v1, v0 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB2_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX10-NEXT: v_max_f32_e32 v2, 4.0, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX10-NEXT: ds_max_f32 v0, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB2_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmax_noret_f32: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v1, v0 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX90A-NEXT: ds_max_f32 v0, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX90A-NEXT: v_max_f32_e32 v2, 4.0, v2 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmax_noret_f32: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v1, v0 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX908-NEXT: v_max_f32_e32 v2, 4.0, v2 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX908-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX908-NEXT: ds_max_f32 v0, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB2_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmax_noret_f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v1, v0 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB2_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX8-NEXT: v_max_f32_e32 v2, 4.0, v2 -; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX8-NEXT: ds_max_f32 v0, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB2_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmax_noret_f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v1, v0 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB2_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: ds_max_f32 v0, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX7-NEXT: v_max_f32_e32 v2, 4.0, v2 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB2_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmax_noret_f32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v1, v0 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB2_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: ds_max_f32 v0, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX6-NEXT: v_max_f32_e32 v2, 4.0, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB2_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr addrspace(3) %ptr, float 4.0 seq_cst ret void @@ -670,204 +290,83 @@ define void @local_atomic_fmax_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v1, v0 offset:65532 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v2, v1, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v2, 4.0, v2 +; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65532 +; GFX12-NEXT: ds_max_num_f32 v0, v1 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB3_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_f32__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_read_b32 v1, v0 offset:65532 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX940-NEXT: ds_max_f32 v0, v1 offset:65532 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX940-NEXT: v_max_f32_e32 v2, 4.0, v2 -; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB3_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmax_noret_f32__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v1, v0 offset:65532 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v2, 4.0, v2 +; GFX11-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65532 +; GFX11-NEXT: ds_max_f32 v0, v1 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB3_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmax_noret_f32__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v1, v0 offset:65532 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX10-NEXT: v_max_f32_e32 v2, 4.0, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532 +; GFX10-NEXT: ds_max_f32 v0, v1 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB3_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmax_noret_f32__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65532 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX90A-NEXT: v_max_f32_e32 v2, 4.0, v2 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532 +; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX90A-NEXT: ds_max_f32 v0, v1 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmax_noret_f32__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v1, v0 offset:65532 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX908-NEXT: v_max_f32_e32 v2, 4.0, v2 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532 +; GFX908-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX908-NEXT: ds_max_f32 v0, v1 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB3_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmax_noret_f32__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v1, v0 offset:65532 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: ds_max_f32 v0, v1 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX8-NEXT: v_max_f32_e32 v2, 4.0, v2 -; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB3_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmax_noret_f32__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v1, v0 offset:65532 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX7-NEXT: v_max_f32_e32 v2, 4.0, v2 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532 +; GFX7-NEXT: ds_max_f32 v0, v1 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB3_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmax_noret_f32__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v1, v0 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX6-NEXT: v_max_f32_e32 v2, 4.0, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX6-NEXT: ds_max_f32 v0, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB3_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(3) %ptr, i32 16383 %unused = atomicrmw fmax ptr addrspace(3) %gep, float 4.0 seq_cst @@ -886,217 +385,90 @@ define double @local_atomic_fmax_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, v0 -; GFX12-NEXT: ds_load_b64 v[0:1], v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[3:4], v[3:4] -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], 4.0, v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[3:4] +; GFX12-NEXT: ds_max_num_rtn_f64 v[0:1], v0, v[1:2] ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4] -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB4_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_f64: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v0 -; GFX940-NEXT: ds_read_b64 v[0:1], v0 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] -; GFX940-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 -; GFX940-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[4:5], v[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX940-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB4_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmax_ret_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-NEXT: ds_load_b64 v[0:1], v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4] -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[3:4] +; GFX11-NEXT: ds_max_rtn_f64 v[0:1], v0, v[1:2] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB4_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmax_ret_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: ds_read_b64 v[0:1], v0 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4] -; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] +; GFX10-NEXT: ds_max_rtn_f64 v[0:1], v0, v[1:2] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4] -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB4_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmax_ret_f64: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: ds_read_b64 v[0:1], v0 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000 +; GFX90A-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 -; GFX90A-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[4:5], v[0:1] -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmax_ret_f64: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: ds_read_b64 v[0:1], v0 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 -; GFX908-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] +; GFX908-NEXT: v_mov_b32_e32 v1, 0 +; GFX908-NEXT: v_mov_b32_e32 v2, 0x40100000 +; GFX908-NEXT: ds_max_rtn_f64 v[0:1], v0, v[1:2] ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB4_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmax_ret_f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: ds_read_b64 v[0:1], v0 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: ds_max_rtn_f64 v[0:1], v0, v[1:2] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 -; GFX8-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB4_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmax_ret_f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: ds_read_b64 v[0:1], v0 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: v_mov_b32_e32 v3, v0 -; GFX7-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4] -; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 -; GFX7-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] +; GFX7-NEXT: ds_max_rtn_f64 v[0:1], v0, v[1:2] ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB4_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmax_ret_f64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, 0 +; GFX6-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: ds_read_b64 v[0:1], v0 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: v_mov_b32_e32 v3, v0 -; GFX6-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4] -; GFX6-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 -; GFX6-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] +; GFX6-NEXT: ds_max_rtn_f64 v[0:1], v0, v[1:2] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB4_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr addrspace(3) %ptr, double 4.0 seq_cst ret double %result @@ -1110,217 +482,91 @@ define double @local_atomic_fmax_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, v0 -; GFX12-NEXT: ds_load_b64 v[0:1], v0 offset:65528 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[3:4], v[3:4] -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], 4.0, v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[3:4] offset:65528 +; GFX12-NEXT: ds_max_num_rtn_f64 v[0:1], v0, v[1:2] offset:65528 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4] -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB5_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_f64__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v0 -; GFX940-NEXT: ds_read_b64 v[0:1], v0 offset:65528 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX940-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3] offset:65528 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] -; GFX940-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 -; GFX940-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[4:5], v[0:1] offset:65528 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB5_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmax_ret_f64__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-NEXT: ds_load_b64 v[0:1], v0 offset:65528 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4] -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[3:4] offset:65528 +; GFX11-NEXT: ds_max_rtn_f64 v[0:1], v0, v[1:2] offset:65528 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB5_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmax_ret_f64__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: ds_read_b64 v[0:1], v0 offset:65528 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4] -; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] offset:65528 +; GFX10-NEXT: ds_max_rtn_f64 v[0:1], v0, v[1:2] offset:65528 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4] -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB5_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmax_ret_f64__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: ds_read_b64 v[0:1], v0 offset:65528 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 -; GFX90A-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[4:5], v[0:1] offset:65528 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000 +; GFX90A-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3] offset:65528 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmax_ret_f64__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: ds_read_b64 v[0:1], v0 offset:65528 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_mov_b32_e32 v1, 0 +; GFX908-NEXT: v_mov_b32_e32 v2, 0x40100000 +; GFX908-NEXT: ds_max_rtn_f64 v[0:1], v0, v[1:2] offset:65528 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 -; GFX908-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] offset:65528 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB5_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmax_ret_f64__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: ds_read_b64 v[0:1], v0 offset:65528 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 -; GFX8-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] offset:65528 +; GFX8-NEXT: ds_max_rtn_f64 v[0:1], v0, v[1:2] offset:65528 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB5_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmax_ret_f64__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: ds_read_b64 v[0:1], v0 offset:65528 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: v_mov_b32_e32 v3, v0 -; GFX7-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4] -; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 -; GFX7-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] offset:65528 +; GFX7-NEXT: ds_max_rtn_f64 v[0:1], v0, v[1:2] offset:65528 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB5_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmax_ret_f64__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xfff8, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, 0 +; GFX6-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b64 v[0:1], v2 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: v_mov_b32_e32 v3, v0 -; GFX6-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4] -; GFX6-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 -; GFX6-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] +; GFX6-NEXT: ds_max_rtn_f64 v[0:1], v2, v[0:1] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB5_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr addrspace(3) %ptr, i32 8191 %result = atomicrmw fmax ptr addrspace(3) %gep, double 4.0 seq_cst @@ -1335,208 +581,90 @@ define void @local_atomic_fmax_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b64 v[1:2], v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[3:4], v[1:2], v[1:2] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[3:4], 4.0, v[3:4] +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] +; GFX12-NEXT: ds_max_num_f64 v0, v[1:2] ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] -; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB6_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_f64: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_read_b64 v[2:3], v0 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX940-NEXT: v_max_f64 v[4:5], v[4:5], 4.0 -; GFX940-NEXT: ds_cmpst_rtn_b64 v[4:5], v0, v[2:3], v[4:5] +; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX940-NEXT: ds_max_f64 v0, v[2:3] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB6_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmax_noret_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b64 v[1:2], v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] +; GFX11-NEXT: ds_max_f64 v0, v[1:2] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] -; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB6_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmax_noret_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b64 v[1:2], v0 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX10-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] +; GFX10-NEXT: ds_max_f64 v0, v[1:2] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] -; GFX10-NEXT: v_mov_b32_e32 v1, v3 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB6_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmax_noret_f64: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b64 v[2:3], v0 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX90A-NEXT: v_max_f64 v[4:5], v[4:5], 4.0 -; GFX90A-NEXT: ds_cmpst_rtn_b64 v[4:5], v0, v[2:3], v[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000 +; GFX90A-NEXT: ds_max_f64 v0, v[2:3] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmax_noret_f64: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b64 v[1:2], v0 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_mov_b32_e32 v1, 0 +; GFX908-NEXT: v_mov_b32_e32 v2, 0x40100000 +; GFX908-NEXT: ds_max_f64 v0, v[1:2] ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX908-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX908-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX908-NEXT: v_mov_b32_e32 v1, v3 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB6_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmax_noret_f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b64 v[1:2], v0 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX8-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX8-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] +; GFX8-NEXT: ds_max_f64 v0, v[1:2] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v1, v3 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB6_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmax_noret_f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b64 v[1:2], v0 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: ds_max_f64 v0, v[1:2] ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX7-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX7-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX7-NEXT: v_mov_b32_e32 v1, v3 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v2, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB6_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmax_noret_f64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, 0 +; GFX6-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b64 v[1:2], v0 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX6-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX6-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] +; GFX6-NEXT: ds_max_f64 v0, v[1:2] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX6-NEXT: v_mov_b32_e32 v1, v3 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v2, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB6_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr addrspace(3) %ptr, double 4.0 seq_cst ret void @@ -1550,209 +678,91 @@ define void @local_atomic_fmax_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b64 v[1:2], v0 offset:65528 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[3:4], v[1:2], v[1:2] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[3:4], 4.0, v[3:4] +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] offset:65528 +; GFX12-NEXT: ds_max_num_f64 v0, v[1:2] offset:65528 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] -; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB7_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_f64__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_read_b64 v[2:3], v0 offset:65528 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX940-NEXT: v_max_f64 v[4:5], v[4:5], 4.0 -; GFX940-NEXT: ds_cmpst_rtn_b64 v[4:5], v0, v[2:3], v[4:5] offset:65528 +; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX940-NEXT: ds_max_f64 v0, v[2:3] offset:65528 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB7_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmax_noret_f64__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b64 v[1:2], v0 offset:65528 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] offset:65528 +; GFX11-NEXT: ds_max_f64 v0, v[1:2] offset:65528 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] -; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB7_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmax_noret_f64__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b64 v[1:2], v0 offset:65528 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX10-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528 +; GFX10-NEXT: ds_max_f64 v0, v[1:2] offset:65528 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] -; GFX10-NEXT: v_mov_b32_e32 v1, v3 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB7_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmax_noret_f64__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b64 v[2:3], v0 offset:65528 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX90A-NEXT: v_max_f64 v[4:5], v[4:5], 4.0 -; GFX90A-NEXT: ds_cmpst_rtn_b64 v[4:5], v0, v[2:3], v[4:5] offset:65528 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000 +; GFX90A-NEXT: ds_max_f64 v0, v[2:3] offset:65528 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmax_noret_f64__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b64 v[1:2], v0 offset:65528 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_mov_b32_e32 v1, 0 +; GFX908-NEXT: v_mov_b32_e32 v2, 0x40100000 +; GFX908-NEXT: ds_max_f64 v0, v[1:2] offset:65528 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX908-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX908-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX908-NEXT: v_mov_b32_e32 v1, v3 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB7_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmax_noret_f64__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b64 v[1:2], v0 offset:65528 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX8-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX8-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528 +; GFX8-NEXT: ds_max_f64 v0, v[1:2] offset:65528 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v1, v3 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB7_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmax_noret_f64__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b64 v[1:2], v0 offset:65528 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: ds_max_f64 v0, v[1:2] offset:65528 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX7-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX7-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX7-NEXT: v_mov_b32_e32 v1, v3 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v2, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB7_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmax_noret_f64__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xfff8, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, 0 +; GFX6-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b64 v[0:1], v2 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_max_f64 v[3:4], v[0:1], v[0:1] -; GFX6-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX6-NEXT: ds_cmpst_rtn_b64 v[3:4], v2, v[0:1], v[3:4] +; GFX6-NEXT: ds_max_f64 v2, v[0:1] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[0:1] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB7_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr addrspace(3) %ptr, i32 8191 %unused = atomicrmw fmax ptr addrspace(3) %gep, double 4.0 seq_cst @@ -8032,212 +7042,82 @@ define float @local_atomic_fmax_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v1, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v1, v2, v2 -; GFX12-NEXT: v_max_num_f32_e32 v1, 4.0, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 +; GFX12-NEXT: ds_max_num_rtn_f32 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB28_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_f32__amdgpu_ignore_denormal_mode: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_read_b32 v1, v0 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX940-NEXT: ds_max_rtn_f32 v0, v0, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX940-NEXT: v_max_f32_e32 v1, 4.0, v1 -; GFX940-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB28_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmax_ret_f32__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v1, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX11-NEXT: v_max_f32_e32 v1, 4.0, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 +; GFX11-NEXT: ds_max_rtn_f32 v0, v0, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB28_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmax_ret_f32__amdgpu_ignore_denormal_mode: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v1, v0 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v1 -; GFX10-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX10-NEXT: v_max_f32_e32 v1, 4.0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX10-NEXT: ds_max_rtn_f32 v0, v0, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB28_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmax_ret_f32__amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v1, v0 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX90A-NEXT: v_max_f32_e32 v1, 4.0, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX90A-NEXT: ds_max_rtn_f32 v0, v0, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmax_ret_f32__amdgpu_ignore_denormal_mode: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v1, v0 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v1 -; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX908-NEXT: v_max_f32_e32 v1, 4.0, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX908-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX908-NEXT: ds_max_rtn_f32 v0, v0, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB28_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v1 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmax_ret_f32__amdgpu_ignore_denormal_mode: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v1, v0 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v1 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX8-NEXT: v_max_f32_e32 v1, 4.0, v1 -; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX8-NEXT: ds_max_rtn_f32 v0, v0, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB28_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmax_ret_f32__amdgpu_ignore_denormal_mode: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v1, v0 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: ds_max_rtn_f32 v0, v0, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, v1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX7-NEXT: v_max_f32_e32 v1, 4.0, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB28_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmax_ret_f32__amdgpu_ignore_denormal_mode: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v1, v0 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX6-NEXT: v_max_f32_e32 v1, 4.0, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX6-NEXT: ds_max_rtn_f32 v0, v0, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB28_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr addrspace(3) %ptr, float 4.0 seq_cst, !amdgpu.ignore.denormal.mode !0 ret float %result @@ -8251,203 +7131,82 @@ define void @local_atomic_fmax_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v1, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v2, v1, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v2, 4.0, v2 +; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 +; GFX12-NEXT: ds_max_num_f32 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB29_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_f32__amdgpu_ignore_denormal_mode: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_read_b32 v1, v0 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX940-NEXT: v_max_f32_e32 v2, 4.0, v2 -; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX940-NEXT: ds_max_f32 v0, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB29_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmax_noret_f32__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v1, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v2, 4.0, v2 +; GFX11-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 +; GFX11-NEXT: ds_max_f32 v0, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB29_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmax_noret_f32__amdgpu_ignore_denormal_mode: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v1, v0 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX10-NEXT: v_max_f32_e32 v2, 4.0, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX10-NEXT: ds_max_f32 v0, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB29_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmax_noret_f32__amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v1, v0 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX90A-NEXT: v_max_f32_e32 v2, 4.0, v2 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX90A-NEXT: ds_max_f32 v0, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmax_noret_f32__amdgpu_ignore_denormal_mode: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v1, v0 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX908-NEXT: ds_max_f32 v0, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX908-NEXT: v_max_f32_e32 v2, 4.0, v2 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB29_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmax_noret_f32__amdgpu_ignore_denormal_mode: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v1, v0 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX8-NEXT: v_max_f32_e32 v2, 4.0, v2 -; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX8-NEXT: ds_max_f32 v0, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB29_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmax_noret_f32__amdgpu_ignore_denormal_mode: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v1, v0 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: ds_max_f32 v0, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX7-NEXT: v_max_f32_e32 v2, 4.0, v2 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB29_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmax_noret_f32__amdgpu_ignore_denormal_mode: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v1, v0 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX6-NEXT: v_max_f32_e32 v2, 4.0, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX6-NEXT: ds_max_f32 v0, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB29_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr addrspace(3) %ptr, float 4.0 seq_cst, !amdgpu.ignore.denormal.mode !0 ret void diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll index c4c70d9..4bc9404 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll @@ -21,212 +21,82 @@ define float @local_atomic_fmin_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v1, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB0_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v1, v2, v2 -; GFX12-NEXT: v_min_num_f32_e32 v1, 4.0, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 +; GFX12-NEXT: ds_min_num_rtn_f32 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB0_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_f32: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_read_b32 v1, v0 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX940-NEXT: v_min_f32_e32 v1, 4.0, v1 -; GFX940-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX940-NEXT: ds_min_rtn_f32 v0, v0, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB0_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmin_ret_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v1, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX11-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 +; GFX11-NEXT: ds_min_rtn_f32 v0, v0, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB0_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmin_ret_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v1, v0 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v1 -; GFX10-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX10-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX10-NEXT: ds_min_rtn_f32 v0, v0, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB0_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmin_ret_f32: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v1, v0 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX90A-NEXT: ds_min_rtn_f32 v0, v0, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX90A-NEXT: v_min_f32_e32 v1, 4.0, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmin_ret_f32: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v1, v0 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX908-NEXT: ds_min_rtn_f32 v0, v0, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v1 -; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX908-NEXT: v_min_f32_e32 v1, 4.0, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB0_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v1 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmin_ret_f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v1, v0 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: ds_min_rtn_f32 v0, v0, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v1 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX8-NEXT: v_min_f32_e32 v1, 4.0, v1 -; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB0_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmin_ret_f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v1, v0 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB0_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: ds_min_rtn_f32 v0, v0, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, v1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX7-NEXT: v_min_f32_e32 v1, 4.0, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB0_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmin_ret_f32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v1, v0 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB0_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX6-NEXT: v_min_f32_e32 v1, 4.0, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX6-NEXT: ds_min_rtn_f32 v0, v0, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB0_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr addrspace(3) %ptr, float 4.0 seq_cst ret float %result @@ -240,212 +110,83 @@ define float @local_atomic_fmin_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v1, v0 offset:65532 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB1_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v1, v2, v2 -; GFX12-NEXT: v_min_num_f32_e32 v1, 4.0, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65532 +; GFX12-NEXT: ds_min_num_rtn_f32 v0, v0, v1 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB1_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_f32__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_read_b32 v1, v0 offset:65532 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX940-NEXT: ds_min_rtn_f32 v0, v0, v1 offset:65532 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX940-NEXT: v_min_f32_e32 v1, 4.0, v1 -; GFX940-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB1_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmin_ret_f32__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v1, v0 offset:65532 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB1_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX11-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65532 +; GFX11-NEXT: ds_min_rtn_f32 v0, v0, v1 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB1_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmin_ret_f32__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v1, v0 offset:65532 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v1 -; GFX10-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX10-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532 +; GFX10-NEXT: ds_min_rtn_f32 v0, v0, v1 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB1_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmin_ret_f32__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65532 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX90A-NEXT: v_min_f32_e32 v1, 4.0, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532 +; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX90A-NEXT: ds_min_rtn_f32 v0, v0, v1 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmin_ret_f32__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v1, v0 offset:65532 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v1 -; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX908-NEXT: v_min_f32_e32 v1, 4.0, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532 +; GFX908-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX908-NEXT: ds_min_rtn_f32 v0, v0, v1 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB1_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v1 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmin_ret_f32__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v1, v0 offset:65532 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v1 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX8-NEXT: v_min_f32_e32 v1, 4.0, v1 -; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532 +; GFX8-NEXT: ds_min_rtn_f32 v0, v0, v1 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB1_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmin_ret_f32__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v1, v0 offset:65532 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB1_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, v1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX7-NEXT: v_min_f32_e32 v1, 4.0, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532 +; GFX7-NEXT: ds_min_rtn_f32 v0, v0, v1 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB1_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmin_ret_f32__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffc, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v0, v1 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB1_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v2 -; GFX6-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX6-NEXT: ds_cmpst_rtn_b32 v0, v1, v2, v0 +; GFX6-NEXT: ds_min_rtn_f32 v0, v0, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB1_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fmin ptr addrspace(3) %gep, float 4.0 seq_cst @@ -460,203 +201,82 @@ define void @local_atomic_fmin_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v1, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB2_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v2, v1, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f32_e32 v2, 4.0, v2 +; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 +; GFX12-NEXT: ds_min_num_f32 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB2_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_f32: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_read_b32 v1, v0 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB2_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX940-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX940-NEXT: ds_min_f32 v0, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB2_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmin_noret_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v1, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB2_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f32_e32 v2, 4.0, v2 +; GFX11-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 +; GFX11-NEXT: ds_min_f32 v0, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB2_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmin_noret_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v1, v0 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB2_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX10-NEXT: v_min_f32_e32 v2, 4.0, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX10-NEXT: ds_min_f32 v0, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB2_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmin_noret_f32: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v1, v0 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX90A-NEXT: ds_min_f32 v0, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX90A-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmin_noret_f32: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v1, v0 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX908-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX908-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX908-NEXT: ds_min_f32 v0, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB2_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmin_noret_f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v1, v0 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB2_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX8-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX8-NEXT: ds_min_f32 v0, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB2_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmin_noret_f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v1, v0 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB2_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: ds_min_f32 v0, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX7-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB2_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmin_noret_f32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v1, v0 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB2_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: ds_min_f32 v0, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX6-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB2_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr addrspace(3) %ptr, float 4.0 seq_cst ret void @@ -670,204 +290,83 @@ define void @local_atomic_fmin_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v1, v0 offset:65532 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v2, v1, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f32_e32 v2, 4.0, v2 +; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65532 +; GFX12-NEXT: ds_min_num_f32 v0, v1 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB3_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_f32__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_read_b32 v1, v0 offset:65532 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX940-NEXT: ds_min_f32 v0, v1 offset:65532 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX940-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB3_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmin_noret_f32__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v1, v0 offset:65532 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f32_e32 v2, 4.0, v2 +; GFX11-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65532 +; GFX11-NEXT: ds_min_f32 v0, v1 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB3_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmin_noret_f32__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v1, v0 offset:65532 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX10-NEXT: v_min_f32_e32 v2, 4.0, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532 +; GFX10-NEXT: ds_min_f32 v0, v1 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB3_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmin_noret_f32__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65532 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX90A-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532 +; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX90A-NEXT: ds_min_f32 v0, v1 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmin_noret_f32__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v1, v0 offset:65532 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX908-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532 +; GFX908-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX908-NEXT: ds_min_f32 v0, v1 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB3_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmin_noret_f32__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v1, v0 offset:65532 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: ds_min_f32 v0, v1 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX8-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB3_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmin_noret_f32__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v1, v0 offset:65532 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX7-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532 +; GFX7-NEXT: ds_min_f32 v0, v1 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB3_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmin_noret_f32__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v1, v0 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX6-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX6-NEXT: ds_min_f32 v0, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB3_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(3) %ptr, i32 16383 %unused = atomicrmw fmin ptr addrspace(3) %gep, float 4.0 seq_cst @@ -886,217 +385,90 @@ define double @local_atomic_fmin_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, v0 -; GFX12-NEXT: ds_load_b64 v[0:1], v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[3:4], v[3:4] -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], 4.0, v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[3:4] +; GFX12-NEXT: ds_min_num_rtn_f64 v[0:1], v0, v[1:2] ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4] -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB4_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_f64: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v0 -; GFX940-NEXT: ds_read_b64 v[0:1], v0 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] -; GFX940-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX940-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 -; GFX940-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[4:5], v[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX940-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB4_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmin_ret_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-NEXT: ds_load_b64 v[0:1], v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4] -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[3:4] +; GFX11-NEXT: ds_min_rtn_f64 v[0:1], v0, v[1:2] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB4_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmin_ret_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: ds_read_b64 v[0:1], v0 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4] -; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] +; GFX10-NEXT: ds_min_rtn_f64 v[0:1], v0, v[1:2] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4] -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB4_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmin_ret_f64: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: ds_read_b64 v[0:1], v0 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000 +; GFX90A-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX90A-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 -; GFX90A-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[4:5], v[0:1] -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmin_ret_f64: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: ds_read_b64 v[0:1], v0 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 -; GFX908-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] +; GFX908-NEXT: v_mov_b32_e32 v1, 0 +; GFX908-NEXT: v_mov_b32_e32 v2, 0x40100000 +; GFX908-NEXT: ds_min_rtn_f64 v[0:1], v0, v[1:2] ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB4_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmin_ret_f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: ds_read_b64 v[0:1], v0 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: ds_min_rtn_f64 v[0:1], v0, v[1:2] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 -; GFX8-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB4_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmin_ret_f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: ds_read_b64 v[0:1], v0 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: v_mov_b32_e32 v3, v0 -; GFX7-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4] -; GFX7-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 -; GFX7-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] +; GFX7-NEXT: ds_min_rtn_f64 v[0:1], v0, v[1:2] ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB4_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmin_ret_f64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, 0 +; GFX6-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: ds_read_b64 v[0:1], v0 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: v_mov_b32_e32 v3, v0 -; GFX6-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4] -; GFX6-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 -; GFX6-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] +; GFX6-NEXT: ds_min_rtn_f64 v[0:1], v0, v[1:2] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB4_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr addrspace(3) %ptr, double 4.0 seq_cst ret double %result @@ -1110,217 +482,91 @@ define double @local_atomic_fmin_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, v0 -; GFX12-NEXT: ds_load_b64 v[0:1], v0 offset:65528 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[3:4], v[3:4] -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], 4.0, v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[3:4] offset:65528 +; GFX12-NEXT: ds_min_num_rtn_f64 v[0:1], v0, v[1:2] offset:65528 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4] -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB5_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_f64__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v0 -; GFX940-NEXT: ds_read_b64 v[0:1], v0 offset:65528 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX940-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3] offset:65528 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] -; GFX940-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX940-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 -; GFX940-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[4:5], v[0:1] offset:65528 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB5_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmin_ret_f64__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-NEXT: ds_load_b64 v[0:1], v0 offset:65528 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4] -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[3:4] offset:65528 +; GFX11-NEXT: ds_min_rtn_f64 v[0:1], v0, v[1:2] offset:65528 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB5_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmin_ret_f64__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: ds_read_b64 v[0:1], v0 offset:65528 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4] -; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] offset:65528 +; GFX10-NEXT: ds_min_rtn_f64 v[0:1], v0, v[1:2] offset:65528 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4] -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB5_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmin_ret_f64__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: ds_read_b64 v[0:1], v0 offset:65528 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX90A-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 -; GFX90A-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[4:5], v[0:1] offset:65528 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000 +; GFX90A-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3] offset:65528 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmin_ret_f64__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: ds_read_b64 v[0:1], v0 offset:65528 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_mov_b32_e32 v1, 0 +; GFX908-NEXT: v_mov_b32_e32 v2, 0x40100000 +; GFX908-NEXT: ds_min_rtn_f64 v[0:1], v0, v[1:2] offset:65528 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 -; GFX908-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] offset:65528 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB5_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmin_ret_f64__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: ds_read_b64 v[0:1], v0 offset:65528 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 -; GFX8-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] offset:65528 +; GFX8-NEXT: ds_min_rtn_f64 v[0:1], v0, v[1:2] offset:65528 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB5_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmin_ret_f64__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: ds_read_b64 v[0:1], v0 offset:65528 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: v_mov_b32_e32 v3, v0 -; GFX7-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4] -; GFX7-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 -; GFX7-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] offset:65528 +; GFX7-NEXT: ds_min_rtn_f64 v[0:1], v0, v[1:2] offset:65528 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB5_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmin_ret_f64__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xfff8, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, 0 +; GFX6-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b64 v[0:1], v2 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: v_mov_b32_e32 v3, v0 -; GFX6-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4] -; GFX6-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 -; GFX6-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] +; GFX6-NEXT: ds_min_rtn_f64 v[0:1], v2, v[0:1] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB5_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr addrspace(3) %ptr, i32 8191 %result = atomicrmw fmin ptr addrspace(3) %gep, double 4.0 seq_cst @@ -1335,208 +581,90 @@ define void @local_atomic_fmin_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b64 v[1:2], v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[3:4], v[1:2], v[1:2] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[3:4], 4.0, v[3:4] +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] +; GFX12-NEXT: ds_min_num_f64 v0, v[1:2] ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] -; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB6_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_f64: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_read_b64 v[2:3], v0 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX940-NEXT: v_min_f64 v[4:5], v[4:5], 4.0 -; GFX940-NEXT: ds_cmpst_rtn_b64 v[4:5], v0, v[2:3], v[4:5] +; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX940-NEXT: ds_min_f64 v0, v[2:3] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB6_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmin_noret_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b64 v[1:2], v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] +; GFX11-NEXT: ds_min_f64 v0, v[1:2] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] -; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB6_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmin_noret_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b64 v[1:2], v0 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX10-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] +; GFX10-NEXT: ds_min_f64 v0, v[1:2] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] -; GFX10-NEXT: v_mov_b32_e32 v1, v3 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB6_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmin_noret_f64: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b64 v[2:3], v0 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], 4.0 -; GFX90A-NEXT: ds_cmpst_rtn_b64 v[4:5], v0, v[2:3], v[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000 +; GFX90A-NEXT: ds_min_f64 v0, v[2:3] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmin_noret_f64: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b64 v[1:2], v0 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_mov_b32_e32 v1, 0 +; GFX908-NEXT: v_mov_b32_e32 v2, 0x40100000 +; GFX908-NEXT: ds_min_f64 v0, v[1:2] ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX908-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX908-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX908-NEXT: v_mov_b32_e32 v1, v3 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB6_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmin_noret_f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b64 v[1:2], v0 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX8-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX8-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] +; GFX8-NEXT: ds_min_f64 v0, v[1:2] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v1, v3 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB6_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmin_noret_f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b64 v[1:2], v0 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: ds_min_f64 v0, v[1:2] ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX7-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX7-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX7-NEXT: v_mov_b32_e32 v1, v3 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v2, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB6_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmin_noret_f64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, 0 +; GFX6-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b64 v[1:2], v0 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX6-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX6-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] +; GFX6-NEXT: ds_min_f64 v0, v[1:2] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX6-NEXT: v_mov_b32_e32 v1, v3 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v2, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB6_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr addrspace(3) %ptr, double 4.0 seq_cst ret void @@ -1550,209 +678,91 @@ define void @local_atomic_fmin_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b64 v[1:2], v0 offset:65528 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[3:4], v[1:2], v[1:2] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[3:4], 4.0, v[3:4] +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] offset:65528 +; GFX12-NEXT: ds_min_num_f64 v0, v[1:2] offset:65528 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] -; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB7_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_f64__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_read_b64 v[2:3], v0 offset:65528 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX940-NEXT: v_min_f64 v[4:5], v[4:5], 4.0 -; GFX940-NEXT: ds_cmpst_rtn_b64 v[4:5], v0, v[2:3], v[4:5] offset:65528 +; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX940-NEXT: ds_min_f64 v0, v[2:3] offset:65528 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB7_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmin_noret_f64__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b64 v[1:2], v0 offset:65528 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] offset:65528 +; GFX11-NEXT: ds_min_f64 v0, v[1:2] offset:65528 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] -; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB7_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmin_noret_f64__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b64 v[1:2], v0 offset:65528 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX10-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528 +; GFX10-NEXT: ds_min_f64 v0, v[1:2] offset:65528 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] -; GFX10-NEXT: v_mov_b32_e32 v1, v3 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB7_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmin_noret_f64__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b64 v[2:3], v0 offset:65528 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], 4.0 -; GFX90A-NEXT: ds_cmpst_rtn_b64 v[4:5], v0, v[2:3], v[4:5] offset:65528 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000 +; GFX90A-NEXT: ds_min_f64 v0, v[2:3] offset:65528 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmin_noret_f64__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b64 v[1:2], v0 offset:65528 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_mov_b32_e32 v1, 0 +; GFX908-NEXT: v_mov_b32_e32 v2, 0x40100000 +; GFX908-NEXT: ds_min_f64 v0, v[1:2] offset:65528 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX908-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX908-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX908-NEXT: v_mov_b32_e32 v1, v3 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB7_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmin_noret_f64__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b64 v[1:2], v0 offset:65528 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX8-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX8-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528 +; GFX8-NEXT: ds_min_f64 v0, v[1:2] offset:65528 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v1, v3 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB7_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmin_noret_f64__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, 0x40100000 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b64 v[1:2], v0 offset:65528 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: ds_min_f64 v0, v[1:2] offset:65528 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX7-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX7-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX7-NEXT: v_mov_b32_e32 v1, v3 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v2, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB7_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmin_noret_f64__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xfff8, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, 0 +; GFX6-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b64 v[0:1], v2 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_max_f64 v[3:4], v[0:1], v[0:1] -; GFX6-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX6-NEXT: ds_cmpst_rtn_b64 v[3:4], v2, v[0:1], v[3:4] +; GFX6-NEXT: ds_min_f64 v2, v[0:1] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[0:1] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB7_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr addrspace(3) %ptr, i32 8191 %unused = atomicrmw fmin ptr addrspace(3) %gep, double 4.0 seq_cst @@ -8032,212 +7042,82 @@ define float @local_atomic_fmin_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v1, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v1, v2, v2 -; GFX12-NEXT: v_min_num_f32_e32 v1, 4.0, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 +; GFX12-NEXT: ds_min_num_rtn_f32 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB28_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_f32__amdgpu_ignore_denormal_mode: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_read_b32 v1, v0 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX940-NEXT: ds_min_rtn_f32 v0, v0, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX940-NEXT: v_min_f32_e32 v1, 4.0, v1 -; GFX940-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB28_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmin_ret_f32__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v1, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX11-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 +; GFX11-NEXT: ds_min_rtn_f32 v0, v0, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB28_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmin_ret_f32__amdgpu_ignore_denormal_mode: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v1, v0 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v1 -; GFX10-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX10-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX10-NEXT: ds_min_rtn_f32 v0, v0, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB28_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmin_ret_f32__amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v1, v0 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX90A-NEXT: v_min_f32_e32 v1, 4.0, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX90A-NEXT: ds_min_rtn_f32 v0, v0, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmin_ret_f32__amdgpu_ignore_denormal_mode: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v1, v0 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v1 -; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX908-NEXT: v_min_f32_e32 v1, 4.0, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX908-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX908-NEXT: ds_min_rtn_f32 v0, v0, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB28_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v1 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmin_ret_f32__amdgpu_ignore_denormal_mode: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v1, v0 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v1 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX8-NEXT: v_min_f32_e32 v1, 4.0, v1 -; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX8-NEXT: ds_min_rtn_f32 v0, v0, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB28_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmin_ret_f32__amdgpu_ignore_denormal_mode: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v1, v0 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: ds_min_rtn_f32 v0, v0, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, v1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX7-NEXT: v_min_f32_e32 v1, 4.0, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB28_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmin_ret_f32__amdgpu_ignore_denormal_mode: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v1, v0 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX6-NEXT: v_min_f32_e32 v1, 4.0, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX6-NEXT: ds_min_rtn_f32 v0, v0, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB28_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr addrspace(3) %ptr, float 4.0 seq_cst, !amdgpu.ignore.denormal.mode !0 ret float %result @@ -8251,203 +7131,82 @@ define void @local_atomic_fmin_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v1, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v2, v1, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f32_e32 v2, 4.0, v2 +; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 +; GFX12-NEXT: ds_min_num_f32 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB29_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_f32__amdgpu_ignore_denormal_mode: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_read_b32 v1, v0 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX940-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX940-NEXT: ds_min_f32 v0, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB29_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: local_atomic_fmin_noret_f32__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v1, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f32_e32 v2, 4.0, v2 +; GFX11-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 +; GFX11-NEXT: ds_min_f32 v0, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB29_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmin_noret_f32__amdgpu_ignore_denormal_mode: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v1, v0 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX10-NEXT: v_min_f32_e32 v2, 4.0, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX10-NEXT: ds_min_f32 v0, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB29_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: local_atomic_fmin_noret_f32__amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v1, v0 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX90A-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX90A-NEXT: ds_min_f32 v0, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: local_atomic_fmin_noret_f32__amdgpu_ignore_denormal_mode: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v1, v0 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX908-NEXT: ds_min_f32 v0, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX908-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB29_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: local_atomic_fmin_noret_f32__amdgpu_ignore_denormal_mode: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v1, v0 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX8-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX8-NEXT: ds_min_f32 v0, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB29_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: local_atomic_fmin_noret_f32__amdgpu_ignore_denormal_mode: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v1, v0 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: ds_min_f32 v0, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX7-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB29_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmin_noret_f32__amdgpu_ignore_denormal_mode: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v1, v0 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX6-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX6-NEXT: ds_min_f32 v0, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB29_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr addrspace(3) %ptr, float 4.0 seq_cst, !amdgpu.ignore.denormal.mode !0 ret void diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll index 5a79bc2..387bec7 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll @@ -46,20 +46,8 @@ define float @test_atomicrmw_fmax_f32_global(ptr addrspace(1) %ptr, float %value define float @test_atomicrmw_fmax_f32_local(ptr addrspace(3) %ptr, float %value) { ; GCN-LABEL: @test_atomicrmw_fmax_f32_local( -; GCN-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(3) [[PTR:%.*]], align 4 -; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] -; GCN: atomicrmw.start: -; GCN-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; GCN-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE:%.*]]) -; GCN-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 -; GCN-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GCN-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 -; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 -; GCN-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 -; GCN-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float -; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; GCN: atomicrmw.end: -; GCN-NEXT: ret float [[TMP6]] +; GCN-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] seq_cst, align 4 +; GCN-NEXT: ret float [[RES]] ; %res = atomicrmw fmax ptr addrspace(3) %ptr, float %value seq_cst ret float %res @@ -242,20 +230,8 @@ define double @test_atomicrmw_fmax_f64_global(ptr addrspace(1) %ptr, double %val define double @test_atomicrmw_fmax_f64_local(ptr addrspace(3) %ptr, double %value) { ; GCN-LABEL: @test_atomicrmw_fmax_f64_local( -; GCN-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8 -; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] -; GCN: atomicrmw.start: -; GCN-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; GCN-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE:%.*]]) -; GCN-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 -; GCN-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GCN-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 -; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 -; GCN-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 -; GCN-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double -; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; GCN: atomicrmw.end: -; GCN-NEXT: ret double [[TMP6]] +; GCN-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(3) [[PTR:%.*]], double [[VALUE:%.*]] seq_cst, align 8 +; GCN-NEXT: ret double [[RES]] ; %res = atomicrmw fmax ptr addrspace(3) %ptr, double %value seq_cst ret double %res diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll index e3d3bfd..e7c8faa 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll @@ -46,20 +46,8 @@ define float @test_atomicrmw_fmin_f32_global(ptr addrspace(1) %ptr, float %value define float @test_atomicrmw_fmin_f32_local(ptr addrspace(3) %ptr, float %value) { ; GCN-LABEL: @test_atomicrmw_fmin_f32_local( -; GCN-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(3) [[PTR:%.*]], align 4 -; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] -; GCN: atomicrmw.start: -; GCN-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; GCN-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE:%.*]]) -; GCN-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 -; GCN-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GCN-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 -; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 -; GCN-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 -; GCN-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float -; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; GCN: atomicrmw.end: -; GCN-NEXT: ret float [[TMP6]] +; GCN-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] seq_cst, align 4 +; GCN-NEXT: ret float [[RES]] ; %res = atomicrmw fmin ptr addrspace(3) %ptr, float %value seq_cst ret float %res @@ -242,20 +230,8 @@ define double @test_atomicrmw_fmin_f64_global(ptr addrspace(1) %ptr, double %val define double @test_atomicrmw_fmin_f64_local(ptr addrspace(3) %ptr, double %value) { ; GCN-LABEL: @test_atomicrmw_fmin_f64_local( -; GCN-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8 -; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] -; GCN: atomicrmw.start: -; GCN-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; GCN-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE:%.*]]) -; GCN-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 -; GCN-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GCN-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 -; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 -; GCN-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 -; GCN-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double -; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; GCN: atomicrmw.end: -; GCN-NEXT: ret double [[TMP6]] +; GCN-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(3) [[PTR:%.*]], double [[VALUE:%.*]] seq_cst, align 8 +; GCN-NEXT: ret double [[RES]] ; %res = atomicrmw fmin ptr addrspace(3) %ptr, double %value seq_cst ret double %res -- cgit v1.1