about summary refs log tree commit diff
diff options
context:
space:
mode:
author Matt Arsenault <Matthew.Arsenault@amd.com> 2024-06-18 18:34:34 +0200
committer GitHub <noreply@github.com> 2024-06-18 18:34:34 +0200
commit 8520061281b0475bf4767107ddc94cf13335db48 (patch)
tree d5dbc78ddb2664c80f5ea18aaeb019c5365f30eb
parent ae6f730b2f6f2055b3a658235ddef91624d532f2 (diff)
download llvm-8520061281b0475bf4767107ddc94cf13335db48.zip
llvm-8520061281b0475bf4767107ddc94cf13335db48.tar.gz
llvm-8520061281b0475bf4767107ddc94cf13335db48.tar.bz2
AMDGPU: Support local atomicrmw fmin/fmax for float/double (#95590)
This has always been supported. Somehow, we ended up with 2 copies of clang builtins for this case, and the newer one erroneously requires gfx8-insts.
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUGISel.td 7
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp 4
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp 2
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h 2
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp 4
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUInstructions.td 2
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp 9
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp 6
-rw-r--r-- llvm/lib/Target/AMDGPU/SIISelLowering.cpp 51
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.td 19
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstructions.td 5
-rw-r--r-- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp 4
-rw-r--r-- llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir 9
-rw-r--r-- llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll 1667
-rw-r--r-- llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll 1667
-rw-r--r-- llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll 32
-rw-r--r-- llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll 32
17 files changed, 479 insertions, 3043 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index d81c188..537d3a4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -271,11 +271,8 @@ def : GINodeEquiv<G_AMDGPU_TBUFFER_STORE_FORMAT_D16, SItbuffer_store_d16>;
// FIXME: Check MMO is atomic
def : GINodeEquiv<G_ATOMICRMW_UINC_WRAP, atomic_load_uinc_wrap_glue>;
def : GINodeEquiv<G_ATOMICRMW_UDEC_WRAP, atomic_load_udec_wrap_glue>;
-def : GINodeEquiv<G_AMDGPU_ATOMIC_FMIN, SIatomic_fmin>;
-def : GINodeEquiv<G_AMDGPU_ATOMIC_FMAX, SIatomic_fmax>;
-def : GINodeEquiv<G_AMDGPU_ATOMIC_FMIN, atomic_load_fmin_glue>;
-def : GINodeEquiv<G_AMDGPU_ATOMIC_FMAX, atomic_load_fmax_glue>;
-
+def : GINodeEquiv<G_ATOMICRMW_FMIN, atomic_load_fmin_glue>;
+def : GINodeEquiv<G_ATOMICRMW_FMAX, atomic_load_fmax_glue>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_SWAP, SIbuffer_atomic_swap>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_ADD, SIbuffer_atomic_add>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index b50c0cc..d60c62a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -502,9 +502,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
// isa<MemSDNode> almost works but is slightly too permissive for some DS
// intrinsics.
- if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N) ||
- Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
- Opc == AMDGPUISD::ATOMIC_LOAD_FMAX) {
+ if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N)) {
N = glueCopyToM0LDSInit(N);
SelectCode(N);
return;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 519e623..522b3a3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5524,8 +5524,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
NODE_NAME_CASE(DS_ORDERED_COUNT)
NODE_NAME_CASE(ATOMIC_CMP_SWAP)
- NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
- NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
NODE_NAME_CASE(BUFFER_LOAD)
NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
NODE_NAME_CASE(BUFFER_LOAD_USHORT)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 206bb46..37572af 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -575,8 +575,6 @@ enum NodeType : unsigned {
TBUFFER_LOAD_FORMAT_D16,
DS_ORDERED_COUNT,
ATOMIC_CMP_SWAP,
- ATOMIC_LOAD_FMIN,
- ATOMIC_LOAD_FMAX,
BUFFER_LOAD,
BUFFER_LOAD_UBYTE,
BUFFER_LOAD_USHORT,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index ae3f2b8..03e2d62 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3620,8 +3620,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
case TargetOpcode::G_ATOMICRMW_FADD:
- case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
- case AMDGPU::G_AMDGPU_ATOMIC_FMAX:
+ case TargetOpcode::G_ATOMICRMW_FMIN:
+ case TargetOpcode::G_ATOMICRMW_FMAX:
return selectG_LOAD_STORE_ATOMICRMW(I);
case TargetOpcode::G_SELECT:
return selectG_SELECT(I);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index ebc6402..21f541d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -685,6 +685,8 @@ defm atomic_load_umax : binary_atomic_op_all_as<atomic_load_umax>;
defm atomic_load_umin : binary_atomic_op_all_as<atomic_load_umin>;
defm atomic_load_xor : binary_atomic_op_all_as<atomic_load_xor>;
defm atomic_load_fadd : binary_atomic_op_fp_all_as<atomic_load_fadd>;
+defm atomic_load_fmin : binary_atomic_op_fp_all_as<atomic_load_fmin>;
+defm atomic_load_fmax : binary_atomic_op_fp_all_as<atomic_load_fmax>;
defm atomic_load_uinc_wrap : binary_atomic_op_all_as<atomic_load_uinc_wrap>;
defm atomic_load_udec_wrap : binary_atomic_op_all_as<atomic_load_udec_wrap>;
defm AMDGPUatomic_cmp_swap : binary_atomic_op_all_as<AMDGPUatomic_cmp_swap>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 0c7b196..4ff945e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -283,7 +283,9 @@ static const LLT S1 = LLT::scalar(1);
static const LLT S8 = LLT::scalar(8);
static const LLT S16 = LLT::scalar(16);
static const LLT S32 = LLT::scalar(32);
+static const LLT F32 = LLT::float32();
static const LLT S64 = LLT::scalar(64);
+static const LLT F64 = LLT::float64();
static const LLT S96 = LLT::scalar(96);
static const LLT S128 = LLT::scalar(128);
static const LLT S160 = LLT::scalar(160);
@@ -1648,6 +1650,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
if (ST.hasFlatAtomicFaddF32Inst())
Atomic.legalFor({{S32, FlatPtr}});
+ getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
+ .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
+
if (ST.hasGFX90AInsts()) {
// These are legal with some caveats, and should have undergone expansion in
// the IR in most situations
@@ -5401,9 +5406,9 @@ static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
case Intrinsic::amdgcn_ds_fadd:
return AMDGPU::G_ATOMICRMW_FADD;
case Intrinsic::amdgcn_ds_fmin:
- return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
+ return AMDGPU::G_ATOMICRMW_FMIN;
case Intrinsic::amdgcn_ds_fmax:
- return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
+ return AMDGPU::G_ATOMICRMW_FMAX;
default:
llvm_unreachable("not a DS FP intrinsic");
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 313d53a..0510a1d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5219,11 +5219,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_ATOMICRMW_UMAX:
case AMDGPU::G_ATOMICRMW_UMIN:
case AMDGPU::G_ATOMICRMW_FADD:
+ case AMDGPU::G_ATOMICRMW_FMIN:
+ case AMDGPU::G_ATOMICRMW_FMAX:
case AMDGPU::G_ATOMICRMW_UINC_WRAP:
case AMDGPU::G_ATOMICRMW_UDEC_WRAP:
- case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
- case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
- case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
+ case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: {
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index c436e03..c607437 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -945,6 +945,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::ATOMIC_LOAD_UMIN,
ISD::ATOMIC_LOAD_UMAX,
ISD::ATOMIC_LOAD_FADD,
+ ISD::ATOMIC_LOAD_FMIN,
+ ISD::ATOMIC_LOAD_FMAX,
ISD::ATOMIC_LOAD_UINC_WRAP,
ISD::ATOMIC_LOAD_UDEC_WRAP,
ISD::INTRINSIC_VOID,
@@ -8707,25 +8709,11 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax: {
MemSDNode *M = cast<MemSDNode>(Op);
- unsigned Opc;
- switch (IntrID) {
- case Intrinsic::amdgcn_ds_fmin:
- Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
- break;
- case Intrinsic::amdgcn_ds_fmax:
- Opc = AMDGPUISD::ATOMIC_LOAD_FMAX;
- break;
- default:
- llvm_unreachable("Unknown intrinsic!");
- }
- SDValue Ops[] = {
- M->getOperand(0), // Chain
- M->getOperand(2), // Ptr
- M->getOperand(3) // Value
- };
-
- return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
- M->getMemoryVT(), M->getMemOperand());
+ unsigned Opc = IntrID == Intrinsic::amdgcn_ds_fmin ? ISD::ATOMIC_LOAD_FMIN
+ : ISD::ATOMIC_LOAD_FMAX;
+ return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(), M->getOperand(0),
+ M->getOperand(2), M->getOperand(3),
+ M->getMemOperand());
}
case Intrinsic::amdgcn_raw_buffer_load:
case Intrinsic::amdgcn_raw_ptr_buffer_load:
@@ -9130,22 +9118,21 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_global_atomic_fmin_num:
case Intrinsic::amdgcn_flat_atomic_fmin:
case Intrinsic::amdgcn_flat_atomic_fmin_num: {
- Opcode = AMDGPUISD::ATOMIC_LOAD_FMIN;
+ Opcode = ISD::ATOMIC_LOAD_FMIN;
break;
}
case Intrinsic::amdgcn_global_atomic_fmax:
case Intrinsic::amdgcn_global_atomic_fmax_num:
case Intrinsic::amdgcn_flat_atomic_fmax:
case Intrinsic::amdgcn_flat_atomic_fmax_num: {
- Opcode = AMDGPUISD::ATOMIC_LOAD_FMAX;
+ Opcode = ISD::ATOMIC_LOAD_FMAX;
break;
}
default:
llvm_unreachable("unhandled atomic opcode");
}
- return DAG.getMemIntrinsicNode(Opcode, SDLoc(Op),
- M->getVTList(), Ops, M->getMemoryVT(),
- M->getMemOperand());
+ return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
+ Ops, M->getMemOperand());
}
case Intrinsic::amdgcn_s_get_barrier_state: {
SDValue Chain = Op->getOperand(0);
@@ -15816,8 +15803,6 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
case ISD::INTRINSIC_W_CHAIN:
return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
case AMDGPUISD::ATOMIC_CMP_SWAP:
- case AMDGPUISD::ATOMIC_LOAD_FMIN:
- case AMDGPUISD::ATOMIC_LOAD_FMAX:
case AMDGPUISD::BUFFER_ATOMIC_SWAP:
case AMDGPUISD::BUFFER_ATOMIC_ADD:
case AMDGPUISD::BUFFER_ATOMIC_SUB:
@@ -16077,17 +16062,21 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
return AtomicExpansionKind::CmpXChg;
}
case AtomicRMWInst::FMin:
- case AtomicRMWInst::FMax:
+ case AtomicRMWInst::FMax: {
+ Type *Ty = RMW->getType();
+
+ // LDS float and double fmin/fmax were always supported.
+ if (AS == AMDGPUAS::LOCAL_ADDRESS && (Ty->isFloatTy() || Ty->isDoubleTy()))
+ return AtomicExpansionKind::None;
+
+ return AtomicExpansionKind::CmpXChg;
+ }
case AtomicRMWInst::Min:
case AtomicRMWInst::Max:
case AtomicRMWInst::UMin:
case AtomicRMWInst::UMax: {
if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
AS == AMDGPUAS::BUFFER_FAT_POINTER) {
- if (RMW->getType()->isFloatTy() &&
- unsafeFPAtomicsDisabled(RMW->getFunction()))
- return AtomicExpansionKind::CmpXChg;
-
// Always expand system scope min/max atomics.
if (HasSystemScope)
return AtomicExpansionKind::CmpXChg;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 9b9ff4a..80c6235 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -72,14 +72,6 @@ def SDTAtomic2_f32 : SDTypeProfile<1, 2, [
SDTCisSameAs<0,2>, SDTCisFP<0>, SDTCisPtrTy<1>
]>;
-def SIatomic_fmin : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMIN", SDTAtomic2_f32,
- [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
->;
-
-def SIatomic_fmax : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMAX", SDTAtomic2_f32,
- [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
->;
-
// load_d16_{lo|hi} ptr, tied_input
def SIload_d16 : SDTypeProfile<1, 2, [
SDTCisPtrTy<1>,
@@ -314,13 +306,6 @@ class isIntType<ValueType SrcVT> {
}
//===----------------------------------------------------------------------===//
-// PatFrags for global memory operations
-//===----------------------------------------------------------------------===//
-
-defm atomic_load_fmin : binary_atomic_op_fp_all_as<SIatomic_fmin>;
-defm atomic_load_fmax : binary_atomic_op_fp_all_as<SIatomic_fmax>;
-
-//===----------------------------------------------------------------------===//
// SDNodes PatFrags for loads/stores with a glue input.
// This is for SDNodes and PatFrag for local loads and stores to
// enable s_mov_b32 m0, -1 to be glued to the memory instructions.
@@ -742,8 +727,8 @@ defm atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">;
defm atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">;
defm atomic_swap : SIAtomicM0Glue2 <"SWAP">;
defm atomic_load_fadd : SIAtomicM0Glue2 <"LOAD_FADD", 0, SDTAtomic2_f32, 0>;
-defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 1, SDTAtomic2_f32, 0>;
-defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 1, SDTAtomic2_f32, 0>;
+defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 0, SDTAtomic2_f32, 0>;
+defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 0, SDTAtomic2_f32, 0>;
def as_i1timm : SDNodeXForm<timm, [{
return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i1);
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index e32bb8f..531b23d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3863,11 +3863,6 @@ def G_AMDGPU_ATOMIC_CMPXCHG : AMDGPUGenericInstruction {
let mayStore = 1;
}
-let Namespace = "AMDGPU" in {
-def G_AMDGPU_ATOMIC_FMIN : G_ATOMICRMW_OP;
-def G_AMDGPU_ATOMIC_FMAX : G_ATOMICRMW_OP;
-}
-
class BufferAtomicGenericInstruction : AMDGPUGenericInstruction {
let OutOperandList = (outs type0:$dst);
let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index ef635fd..e6a439e 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -590,9 +590,7 @@ bool isCvt_F32_Fp8_Bf8_e64(unsigned Opc) {
}
bool isGenericAtomic(unsigned Opc) {
- return Opc == AMDGPU::G_AMDGPU_ATOMIC_FMIN ||
- Opc == AMDGPU::G_AMDGPU_ATOMIC_FMAX ||
- Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP ||
+ return Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP ||
Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD ||
Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB ||
Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN ||
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir
index 6581c5c..f2ba7f8 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir
@@ -98,12 +98,13 @@ body: |
%2:_(s32) = IMPLICIT_DEF
%3:_(<4 x s32>) = COPY $sgpr4_sgpr5_sgpr6_sgpr7
%4:_(s32) = G_CONSTANT i32 0
+ %ptr_lds:_(p3) = G_IMPLICIT_DEF
- ; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_AMDGPU_ATOMIC_FMIN
- %5:_(s32) = G_AMDGPU_ATOMIC_FMIN %0, %3
+ ; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_FMIN
+ %5:_(s32) = G_ATOMICRMW_FMIN %ptr_lds, %4
- ; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_AMDGPU_ATOMIC_FMAX
- %6:_(s32) = G_AMDGPU_ATOMIC_FMAX %0, %3
+ ; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_FMAX
+ %6:_(s32) = G_ATOMICRMW_FMAX %ptr_lds, %4
; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_AMDGPU_BUFFER_ATOMIC_SWAP
%7:_(s32) = G_AMDGPU_BUFFER_ATOMIC_SWAP %0, %3, %4, %4, %4, 0, 0, 0 :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
index 6c227c7..9a6ac12 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
@@ -21,212 +21,82 @@ define float @local_atomic_fmax_ret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v1, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v1, v2, v2
-; GFX12-NEXT: v_max_num_f32_e32 v1, 4.0, v1
+; GFX12-NEXT: v_mov_b32_e32 v1, 4.0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2
+; GFX12-NEXT: ds_max_num_rtn_f32 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB0_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_f32:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: ds_read_b32 v1, v0
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, v1
-; GFX940-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX940-NEXT: v_max_f32_e32 v1, 4.0, v1
-; GFX940-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX940-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX940-NEXT: ds_max_rtn_f32 v0, v0, v1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB0_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: local_atomic_fmax_ret_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v1, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX11-NEXT: v_max_f32_e32 v1, 4.0, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, 4.0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2
+; GFX11-NEXT: ds_max_rtn_f32 v0, v0, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB0_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmax_ret_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: ds_read_b32 v1, v0
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, v1
-; GFX10-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX10-NEXT: v_max_f32_e32 v1, 4.0, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, 4.0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX10-NEXT: ds_max_rtn_f32 v0, v0, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB0_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: local_atomic_fmax_ret_f32:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v1, v0
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX90A-NEXT: v_max_f32_e32 v1, 4.0, v1
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX90A-NEXT: ds_max_rtn_f32 v0, v0, v1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB0_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fmax_ret_f32:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: ds_read_b32 v1, v0
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, v1
-; GFX908-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX908-NEXT: v_max_f32_e32 v1, 4.0, v1
-; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX908-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX908-NEXT: ds_max_rtn_f32 v0, v0, v1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB0_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v1
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: local_atomic_fmax_ret_f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, 4.0
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_read_b32 v1, v0
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: ds_max_rtn_f32 v0, v0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, v1
-; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; GFX8-NEXT: v_max_f32_e32 v1, 4.0, v1
-; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB0_1
-; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: local_atomic_fmax_ret_f32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v1, 4.0
; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: ds_read_b32 v1, v0
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v2, v1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; GFX7-NEXT: v_max_f32_e32 v1, 4.0, v1
-; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX7-NEXT: ds_max_rtn_f32 v0, v0, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB0_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fmax_ret_f32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, 4.0
; GFX6-NEXT: s_mov_b32 m0, -1
-; GFX6-NEXT: ds_read_b32 v1, v0
-; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: ds_max_rtn_f32 v0, v0, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, v1
-; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; GFX6-NEXT: v_max_f32_e32 v1, 4.0, v1
-; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT: s_cbranch_execnz .LBB0_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmax ptr addrspace(3) %ptr, float 4.0 seq_cst
ret float %result
@@ -240,212 +110,83 @@ define float @local_atomic_fmax_ret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v1, v0 offset:65532
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v1, v2, v2
-; GFX12-NEXT: v_max_num_f32_e32 v1, 4.0, v1
+; GFX12-NEXT: v_mov_b32_e32 v1, 4.0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65532
+; GFX12-NEXT: ds_max_num_rtn_f32 v0, v0, v1 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB1_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_f32__offset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: ds_read_b32 v1, v0 offset:65532
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX940-NEXT: ds_max_rtn_f32 v0, v0, v1 offset:65532
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, v1
-; GFX940-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX940-NEXT: v_max_f32_e32 v1, 4.0, v1
-; GFX940-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB1_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: local_atomic_fmax_ret_f32__offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v1, v0 offset:65532
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX11-NEXT: v_max_f32_e32 v1, 4.0, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, 4.0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65532
+; GFX11-NEXT: ds_max_rtn_f32 v0, v0, v1 offset:65532
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB1_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmax_ret_f32__offset:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: ds_read_b32 v1, v0 offset:65532
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, v1
-; GFX10-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX10-NEXT: v_max_f32_e32 v1, 4.0, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, 4.0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532
+; GFX10-NEXT: ds_max_rtn_f32 v0, v0, v1 offset:65532
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB1_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: local_atomic_fmax_ret_f32__offset:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65532
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX90A-NEXT: ds_max_rtn_f32 v0, v0, v1 offset:65532
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX90A-NEXT: v_max_f32_e32 v1, 4.0, v1
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB1_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fmax_ret_f32__offset:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: ds_read_b32 v1, v0 offset:65532
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, v1
-; GFX908-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX908-NEXT: v_max_f32_e32 v1, 4.0, v1
-; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532
+; GFX908-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX908-NEXT: ds_max_rtn_f32 v0, v0, v1 offset:65532
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB1_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v1
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: local_atomic_fmax_ret_f32__offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, 4.0
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_read_b32 v1, v0 offset:65532
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: ds_max_rtn_f32 v0, v0, v1 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, v1
-; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; GFX8-NEXT: v_max_f32_e32 v1, 4.0, v1
-; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB1_1
-; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: local_atomic_fmax_ret_f32__offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v1, 4.0
; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: ds_read_b32 v1, v0 offset:65532
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v2, v1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; GFX7-NEXT: v_max_f32_e32 v1, 4.0, v1
-; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532
+; GFX7-NEXT: ds_max_rtn_f32 v0, v0, v1 offset:65532
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB1_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fmax_ret_f32__offset:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffc, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0
+; GFX6-NEXT: v_mov_b32_e32 v1, 4.0
; GFX6-NEXT: s_mov_b32 m0, -1
-; GFX6-NEXT: ds_read_b32 v0, v1
-; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: ds_max_rtn_f32 v0, v0, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, v0
-; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v2
-; GFX6-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX6-NEXT: ds_cmpst_rtn_b32 v0, v1, v2, v0
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
-; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT: s_cbranch_execnz .LBB1_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(3) %ptr, i32 16383
%result = atomicrmw fmax ptr addrspace(3) %gep, float 4.0 seq_cst
@@ -460,203 +201,82 @@ define void @local_atomic_fmax_noret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v1, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v1, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v2, 4.0, v2
+; GFX12-NEXT: v_mov_b32_e32 v1, 4.0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1
+; GFX12-NEXT: ds_max_num_f32 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-NEXT: v_mov_b32_e32 v1, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB2_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_f32:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: ds_read_b32 v1, v0
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX940-NEXT: v_max_f32_e32 v2, 4.0, v2
-; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX940-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX940-NEXT: ds_max_f32 v0, v1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v1, v2
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB2_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: local_atomic_fmax_noret_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v1, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v2, 4.0, v2
+; GFX11-NEXT: v_mov_b32_e32 v1, 4.0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1
+; GFX11-NEXT: ds_max_f32 v0, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB2_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmax_noret_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: ds_read_b32 v1, v0
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX10-NEXT: v_max_f32_e32 v2, 4.0, v2
+; GFX10-NEXT: v_mov_b32_e32 v1, 4.0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX10-NEXT: ds_max_f32 v0, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB2_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: local_atomic_fmax_noret_f32:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v1, v0
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX90A-NEXT: ds_max_f32 v0, v1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX90A-NEXT: v_max_f32_e32 v2, 4.0, v2
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB2_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fmax_noret_f32:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: ds_read_b32 v1, v0
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX908-NEXT: ds_max_f32 v0, v1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX908-NEXT: v_max_f32_e32 v2, 4.0, v2
-; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v1, v2
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB2_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: local_atomic_fmax_noret_f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, 4.0
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_read_b32 v1, v0
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; GFX8-NEXT: v_max_f32_e32 v2, 4.0, v2
-; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX8-NEXT: ds_max_f32 v0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v2
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB2_1
-; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: local_atomic_fmax_noret_f32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v1, 4.0
; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: ds_read_b32 v1, v0
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; GFX7-NEXT: v_max_f32_e32 v2, 4.0, v2
-; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX7-NEXT: ds_max_f32 v0, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v1, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB2_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fmax_noret_f32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, 4.0
; GFX6-NEXT: s_mov_b32 m0, -1
-; GFX6-NEXT: ds_read_b32 v1, v0
-; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: ds_max_f32 v0, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; GFX6-NEXT: v_max_f32_e32 v2, 4.0, v2
-; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v2
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT: s_cbranch_execnz .LBB2_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmax ptr addrspace(3) %ptr, float 4.0 seq_cst
ret void
@@ -670,204 +290,83 @@ define void @local_atomic_fmax_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v1, v0 offset:65532
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v1, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v2, 4.0, v2
+; GFX12-NEXT: v_mov_b32_e32 v1, 4.0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65532
+; GFX12-NEXT: ds_max_num_f32 v0, v1 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-NEXT: v_mov_b32_e32 v1, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB3_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_f32__offset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: ds_read_b32 v1, v0 offset:65532
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX940-NEXT: ds_max_f32 v0, v1 offset:65532
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX940-NEXT: v_max_f32_e32 v2, 4.0, v2
-; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v1, v2
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB3_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: local_atomic_fmax_noret_f32__offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v1, v0 offset:65532
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v2, 4.0, v2
+; GFX11-NEXT: v_mov_b32_e32 v1, 4.0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65532
+; GFX11-NEXT: ds_max_f32 v0, v1 offset:65532
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB3_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmax_noret_f32__offset:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: ds_read_b32 v1, v0 offset:65532
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX10-NEXT: v_max_f32_e32 v2, 4.0, v2
+; GFX10-NEXT: v_mov_b32_e32 v1, 4.0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532
+; GFX10-NEXT: ds_max_f32 v0, v1 offset:65532
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB3_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: local_atomic_fmax_noret_f32__offset:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65532
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX90A-NEXT: ds_max_f32 v0, v1 offset:65532
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX90A-NEXT: v_max_f32_e32 v2, 4.0, v2
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB3_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fmax_noret_f32__offset:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: ds_read_b32 v1, v0 offset:65532
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX908-NEXT: v_max_f32_e32 v2, 4.0, v2
-; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532
+; GFX908-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX908-NEXT: ds_max_f32 v0, v1 offset:65532
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v1, v2
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB3_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: local_atomic_fmax_noret_f32__offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, 4.0
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_read_b32 v1, v0 offset:65532
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: ds_max_f32 v0, v1 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; GFX8-NEXT: v_max_f32_e32 v2, 4.0, v2
-; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v2
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB3_1
-; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: local_atomic_fmax_noret_f32__offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v1, 4.0
; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: ds_read_b32 v1, v0 offset:65532
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: ds_max_f32 v0, v1 offset:65532
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; GFX7-NEXT: v_max_f32_e32 v2, 4.0, v2
-; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v1, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB3_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fmax_noret_f32__offset:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0
+; GFX6-NEXT: v_mov_b32_e32 v1, 4.0
; GFX6-NEXT: s_mov_b32 m0, -1
-; GFX6-NEXT: ds_read_b32 v1, v0
-; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; GFX6-NEXT: v_max_f32_e32 v2, 4.0, v2
-; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX6-NEXT: ds_max_f32 v0, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v2
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT: s_cbranch_execnz .LBB3_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(3) %ptr, i32 16383
%unused = atomicrmw fmax ptr addrspace(3) %gep, float 4.0 seq_cst
@@ -886,217 +385,90 @@ define double @local_atomic_fmax_ret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, v0
-; GFX12-NEXT: ds_load_b64 v[0:1], v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[3:4], v[3:4]
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], 4.0, v[0:1]
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[3:4]
+; GFX12-NEXT: ds_max_num_rtn_f64 v[0:1], v0, v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4]
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB4_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_f64:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, v0
-; GFX940-NEXT: ds_read_b64 v[0:1], v0
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
-; GFX940-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
-; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX940-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[4:5], v[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0
+; GFX940-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3]
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB4_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: local_atomic_fmax_ret_f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, v0
-; GFX11-NEXT: ds_load_b64 v[0:1], v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4]
-; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[3:4]
+; GFX11-NEXT: ds_max_rtn_f64 v[0:1], v0, v[1:2]
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4]
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB4_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmax_ret_f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, v0
-; GFX10-NEXT: ds_read_b64 v[0:1], v0
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v1
-; GFX10-NEXT: v_mov_b32_e32 v3, v0
-; GFX10-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4]
-; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1]
+; GFX10-NEXT: ds_max_rtn_f64 v[0:1], v0, v[1:2]
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4]
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB4_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: local_atomic_fmax_ret_f64:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
-; GFX90A-NEXT: ds_read_b64 v[0:1], v0
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000
+; GFX90A-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
-; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX90A-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[4:5], v[0:1]
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB4_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fmax_ret_f64:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, v0
-; GFX908-NEXT: ds_read_b64 v[0:1], v0
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v1
-; GFX908-NEXT: v_mov_b32_e32 v3, v0
-; GFX908-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4]
-; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX908-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1]
+; GFX908-NEXT: v_mov_b32_e32 v1, 0
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x40100000
+; GFX908-NEXT: ds_max_rtn_f64 v[0:1], v0, v[1:2]
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4]
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB4_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: local_atomic_fmax_ret_f64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: ds_read_b64 v[0:1], v0
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, v0
-; GFX8-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4]
-; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX8-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1]
+; GFX8-NEXT: ds_max_rtn_f64 v[0:1], v0, v[1:2]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4]
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB4_1
-; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: local_atomic_fmax_ret_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: ds_read_b64 v[0:1], v0
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v1
-; GFX7-NEXT: v_mov_b32_e32 v3, v0
-; GFX7-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4]
-; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX7-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1]
+; GFX7-NEXT: ds_max_rtn_f64 v[0:1], v0, v[1:2]
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4]
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB4_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fmax_ret_f64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, 0
+; GFX6-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX6-NEXT: s_mov_b32 m0, -1
-; GFX6-NEXT: v_mov_b32_e32 v2, v0
-; GFX6-NEXT: ds_read_b64 v[0:1], v0
-; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, v1
-; GFX6-NEXT: v_mov_b32_e32 v3, v0
-; GFX6-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4]
-; GFX6-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX6-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1]
+; GFX6-NEXT: ds_max_rtn_f64 v[0:1], v0, v[1:2]
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4]
-; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT: s_cbranch_execnz .LBB4_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmax ptr addrspace(3) %ptr, double 4.0 seq_cst
ret double %result
@@ -1110,217 +482,91 @@ define double @local_atomic_fmax_ret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, v0
-; GFX12-NEXT: ds_load_b64 v[0:1], v0 offset:65528
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[3:4], v[3:4]
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], 4.0, v[0:1]
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[3:4] offset:65528
+; GFX12-NEXT: ds_max_num_rtn_f64 v[0:1], v0, v[1:2] offset:65528
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4]
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB5_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_f64__offset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, v0
-; GFX940-NEXT: ds_read_b64 v[0:1], v0 offset:65528
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
-; GFX940-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
-; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX940-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[4:5], v[0:1] offset:65528
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0
+; GFX940-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3] offset:65528
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB5_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: local_atomic_fmax_ret_f64__offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, v0
-; GFX11-NEXT: ds_load_b64 v[0:1], v0 offset:65528
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4]
-; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[3:4] offset:65528
+; GFX11-NEXT: ds_max_rtn_f64 v[0:1], v0, v[1:2] offset:65528
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4]
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB5_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmax_ret_f64__offset:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, v0
-; GFX10-NEXT: ds_read_b64 v[0:1], v0 offset:65528
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v1
-; GFX10-NEXT: v_mov_b32_e32 v3, v0
-; GFX10-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4]
-; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] offset:65528
+; GFX10-NEXT: ds_max_rtn_f64 v[0:1], v0, v[1:2] offset:65528
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4]
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB5_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: local_atomic_fmax_ret_f64__offset:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
-; GFX90A-NEXT: ds_read_b64 v[0:1], v0 offset:65528
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
-; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX90A-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[4:5], v[0:1] offset:65528
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000
+; GFX90A-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3] offset:65528
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB5_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fmax_ret_f64__offset:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, v0
-; GFX908-NEXT: ds_read_b64 v[0:1], v0 offset:65528
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v1
-; GFX908-NEXT: v_mov_b32_e32 v3, v0
-; GFX908-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4]
-; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX908-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] offset:65528
+; GFX908-NEXT: v_mov_b32_e32 v1, 0
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x40100000
+; GFX908-NEXT: ds_max_rtn_f64 v[0:1], v0, v[1:2] offset:65528
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4]
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB5_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: local_atomic_fmax_ret_f64__offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: ds_read_b64 v[0:1], v0 offset:65528
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, v0
-; GFX8-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4]
-; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX8-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] offset:65528
+; GFX8-NEXT: ds_max_rtn_f64 v[0:1], v0, v[1:2] offset:65528
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4]
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB5_1
-; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: local_atomic_fmax_ret_f64__offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: ds_read_b64 v[0:1], v0 offset:65528
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v1
-; GFX7-NEXT: v_mov_b32_e32 v3, v0
-; GFX7-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4]
-; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX7-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] offset:65528
+; GFX7-NEXT: ds_max_rtn_f64 v[0:1], v0, v[1:2] offset:65528
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4]
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB5_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fmax_ret_f64__offset:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xfff8, v0
+; GFX6-NEXT: v_mov_b32_e32 v0, 0
+; GFX6-NEXT: v_mov_b32_e32 v1, 0x40100000
; GFX6-NEXT: s_mov_b32 m0, -1
-; GFX6-NEXT: ds_read_b64 v[0:1], v2
-; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: ds_max_rtn_f64 v[0:1], v2, v[0:1]
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, v1
-; GFX6-NEXT: v_mov_b32_e32 v3, v0
-; GFX6-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4]
-; GFX6-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX6-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1]
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4]
-; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT: s_cbranch_execnz .LBB5_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr addrspace(3) %ptr, i32 8191
%result = atomicrmw fmax ptr addrspace(3) %gep, double 4.0 seq_cst
@@ -1335,208 +581,90 @@ define void @local_atomic_fmax_noret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b64 v[1:2], v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[3:4], v[1:2], v[1:2]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[3:4], 4.0, v[3:4]
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2]
+; GFX12-NEXT: ds_max_num_f64 v0, v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
-; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB6_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_f64:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: ds_read_b64 v[2:3], v0
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0
+; GFX940-NEXT: ds_max_f64 v0, v[2:3]
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX940-NEXT: v_max_f64 v[4:5], v[4:5], 4.0
-; GFX940-NEXT: ds_cmpst_rtn_b64 v[4:5], v0, v[2:3], v[4:5]
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[4:5]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB6_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: local_atomic_fmax_noret_f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b64 v[1:2], v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2]
+; GFX11-NEXT: ds_max_f64 v0, v[1:2]
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
-; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB6_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmax_noret_f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: ds_read_b64 v[1:2], v0
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX10-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4]
+; GFX10-NEXT: ds_max_f64 v0, v[1:2]
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
-; GFX10-NEXT: v_mov_b32_e32 v1, v3
-; GFX10-NEXT: v_mov_b32_e32 v2, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB6_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: local_atomic_fmax_noret_f64:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b64 v[2:3], v0
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000
+; GFX90A-NEXT: ds_max_f64 v0, v[2:3]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX90A-NEXT: v_max_f64 v[4:5], v[4:5], 4.0
-; GFX90A-NEXT: ds_cmpst_rtn_b64 v[4:5], v0, v[2:3], v[4:5]
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB6_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fmax_noret_f64:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: ds_read_b64 v[1:2], v0
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: v_mov_b32_e32 v1, 0
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x40100000
+; GFX908-NEXT: ds_max_f64 v0, v[1:2]
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX908-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
-; GFX908-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4]
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
-; GFX908-NEXT: v_mov_b32_e32 v1, v3
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v2, v4
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB6_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: local_atomic_fmax_noret_f64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_read_b64 v[1:2], v0
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: ds_max_f64 v0, v[1:2]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX8-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
-; GFX8-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4]
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v1, v3
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v2, v4
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB6_1
-; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: local_atomic_fmax_noret_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: ds_read_b64 v[1:2], v0
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: ds_max_f64 v0, v[1:2]
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX7-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
-; GFX7-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4]
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
-; GFX7-NEXT: v_mov_b32_e32 v1, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v2, v4
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB6_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fmax_noret_f64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, 0
+; GFX6-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX6-NEXT: s_mov_b32 m0, -1
-; GFX6-NEXT: ds_read_b64 v[1:2], v0
-; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX6-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
-; GFX6-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4]
+; GFX6-NEXT: ds_max_f64 v0, v[1:2]
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
-; GFX6-NEXT: v_mov_b32_e32 v1, v3
-; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v2, v4
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT: s_cbranch_execnz .LBB6_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmax ptr addrspace(3) %ptr, double 4.0 seq_cst
ret void
@@ -1550,209 +678,91 @@ define void @local_atomic_fmax_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b64 v[1:2], v0 offset:65528
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[3:4], v[1:2], v[1:2]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[3:4], 4.0, v[3:4]
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] offset:65528
+; GFX12-NEXT: ds_max_num_f64 v0, v[1:2] offset:65528
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
-; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB7_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_f64__offset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: ds_read_b64 v[2:3], v0 offset:65528
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0
+; GFX940-NEXT: ds_max_f64 v0, v[2:3] offset:65528
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX940-NEXT: v_max_f64 v[4:5], v[4:5], 4.0
-; GFX940-NEXT: ds_cmpst_rtn_b64 v[4:5], v0, v[2:3], v[4:5] offset:65528
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[4:5]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB7_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: local_atomic_fmax_noret_f64__offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b64 v[1:2], v0 offset:65528
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] offset:65528
+; GFX11-NEXT: ds_max_f64 v0, v[1:2] offset:65528
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
-; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB7_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmax_noret_f64__offset:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: ds_read_b64 v[1:2], v0 offset:65528
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX10-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528
+; GFX10-NEXT: ds_max_f64 v0, v[1:2] offset:65528
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
-; GFX10-NEXT: v_mov_b32_e32 v1, v3
-; GFX10-NEXT: v_mov_b32_e32 v2, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB7_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: local_atomic_fmax_noret_f64__offset:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b64 v[2:3], v0 offset:65528
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000
+; GFX90A-NEXT: ds_max_f64 v0, v[2:3] offset:65528
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX90A-NEXT: v_max_f64 v[4:5], v[4:5], 4.0
-; GFX90A-NEXT: ds_cmpst_rtn_b64 v[4:5], v0, v[2:3], v[4:5] offset:65528
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB7_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fmax_noret_f64__offset:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: ds_read_b64 v[1:2], v0 offset:65528
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: v_mov_b32_e32 v1, 0
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x40100000
+; GFX908-NEXT: ds_max_f64 v0, v[1:2] offset:65528
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX908-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
-; GFX908-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
-; GFX908-NEXT: v_mov_b32_e32 v1, v3
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v2, v4
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB7_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: local_atomic_fmax_noret_f64__offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_read_b64 v[1:2], v0 offset:65528
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX8-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
-; GFX8-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528
+; GFX8-NEXT: ds_max_f64 v0, v[1:2] offset:65528
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v1, v3
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v2, v4
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB7_1
-; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: local_atomic_fmax_noret_f64__offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: ds_read_b64 v[1:2], v0 offset:65528
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX7-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
-; GFX7-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528
+; GFX7-NEXT: ds_max_f64 v0, v[1:2] offset:65528
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
-; GFX7-NEXT: v_mov_b32_e32 v1, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v2, v4
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB7_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fmax_noret_f64__offset:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xfff8, v0
+; GFX6-NEXT: v_mov_b32_e32 v0, 0
+; GFX6-NEXT: v_mov_b32_e32 v1, 0x40100000
; GFX6-NEXT: s_mov_b32 m0, -1
-; GFX6-NEXT: ds_read_b64 v[0:1], v2
-; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_max_f64 v[3:4], v[0:1], v[0:1]
-; GFX6-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
-; GFX6-NEXT: ds_cmpst_rtn_b64 v[3:4], v2, v[0:1], v[3:4]
+; GFX6-NEXT: ds_max_f64 v2, v[0:1]
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[0:1]
-; GFX6-NEXT: v_mov_b32_e32 v0, v3
-; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v4
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT: s_cbranch_execnz .LBB7_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr addrspace(3) %ptr, i32 8191
%unused = atomicrmw fmax ptr addrspace(3) %gep, double 4.0 seq_cst
@@ -8032,212 +7042,82 @@ define float @local_atomic_fmax_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v1, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v1, v2, v2
-; GFX12-NEXT: v_max_num_f32_e32 v1, 4.0, v1
+; GFX12-NEXT: v_mov_b32_e32 v1, 4.0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2
+; GFX12-NEXT: ds_max_num_rtn_f32 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB28_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_f32__amdgpu_ignore_denormal_mode:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: ds_read_b32 v1, v0
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, v1
-; GFX940-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX940-NEXT: v_max_f32_e32 v1, 4.0, v1
-; GFX940-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX940-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX940-NEXT: ds_max_rtn_f32 v0, v0, v1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB28_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: local_atomic_fmax_ret_f32__amdgpu_ignore_denormal_mode:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v1, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX11-NEXT: v_max_f32_e32 v1, 4.0, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, 4.0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2
+; GFX11-NEXT: ds_max_rtn_f32 v0, v0, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB28_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmax_ret_f32__amdgpu_ignore_denormal_mode:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: ds_read_b32 v1, v0
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, v1
-; GFX10-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX10-NEXT: v_max_f32_e32 v1, 4.0, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, 4.0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX10-NEXT: ds_max_rtn_f32 v0, v0, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB28_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: local_atomic_fmax_ret_f32__amdgpu_ignore_denormal_mode:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v1, v0
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX90A-NEXT: ds_max_rtn_f32 v0, v0, v1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX90A-NEXT: v_max_f32_e32 v1, 4.0, v1
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB28_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fmax_ret_f32__amdgpu_ignore_denormal_mode:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: ds_read_b32 v1, v0
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, v1
-; GFX908-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX908-NEXT: v_max_f32_e32 v1, 4.0, v1
-; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX908-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX908-NEXT: ds_max_rtn_f32 v0, v0, v1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB28_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v1
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: local_atomic_fmax_ret_f32__amdgpu_ignore_denormal_mode:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, 4.0
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_read_b32 v1, v0
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, v1
-; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; GFX8-NEXT: v_max_f32_e32 v1, 4.0, v1
-; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX8-NEXT: ds_max_rtn_f32 v0, v0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB28_1
-; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: local_atomic_fmax_ret_f32__amdgpu_ignore_denormal_mode:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v1, 4.0
; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: ds_read_b32 v1, v0
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: ds_max_rtn_f32 v0, v0, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v2, v1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; GFX7-NEXT: v_max_f32_e32 v1, 4.0, v1
-; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB28_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fmax_ret_f32__amdgpu_ignore_denormal_mode:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, 4.0
; GFX6-NEXT: s_mov_b32 m0, -1
-; GFX6-NEXT: ds_read_b32 v1, v0
-; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, v1
-; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; GFX6-NEXT: v_max_f32_e32 v1, 4.0, v1
-; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX6-NEXT: ds_max_rtn_f32 v0, v0, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT: s_cbranch_execnz .LBB28_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmax ptr addrspace(3) %ptr, float 4.0 seq_cst, !amdgpu.ignore.denormal.mode !0
ret float %result
@@ -8251,203 +7131,82 @@ define void @local_atomic_fmax_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v1, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v1, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v2, 4.0, v2
+; GFX12-NEXT: v_mov_b32_e32 v1, 4.0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1
+; GFX12-NEXT: ds_max_num_f32 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-NEXT: v_mov_b32_e32 v1, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB29_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_f32__amdgpu_ignore_denormal_mode:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: ds_read_b32 v1, v0
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX940-NEXT: v_max_f32_e32 v2, 4.0, v2
-; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX940-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX940-NEXT: ds_max_f32 v0, v1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v1, v2
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB29_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: local_atomic_fmax_noret_f32__amdgpu_ignore_denormal_mode:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v1, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v2, 4.0, v2
+; GFX11-NEXT: v_mov_b32_e32 v1, 4.0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1
+; GFX11-NEXT: ds_max_f32 v0, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB29_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmax_noret_f32__amdgpu_ignore_denormal_mode:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: ds_read_b32 v1, v0
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX10-NEXT: v_max_f32_e32 v2, 4.0, v2
+; GFX10-NEXT: v_mov_b32_e32 v1, 4.0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX10-NEXT: ds_max_f32 v0, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB29_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: local_atomic_fmax_noret_f32__amdgpu_ignore_denormal_mode:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v1, v0
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX90A-NEXT: ds_max_f32 v0, v1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX90A-NEXT: v_max_f32_e32 v2, 4.0, v2
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB29_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fmax_noret_f32__amdgpu_ignore_denormal_mode:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: ds_read_b32 v1, v0
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX908-NEXT: ds_max_f32 v0, v1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX908-NEXT: v_max_f32_e32 v2, 4.0, v2
-; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v1, v2
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB29_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: local_atomic_fmax_noret_f32__amdgpu_ignore_denormal_mode:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, 4.0
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_read_b32 v1, v0
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: ds_max_f32 v0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; GFX8-NEXT: v_max_f32_e32 v2, 4.0, v2
-; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v2
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB29_1
-; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: local_atomic_fmax_noret_f32__amdgpu_ignore_denormal_mode:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v1, 4.0
; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: ds_read_b32 v1, v0
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: ds_max_f32 v0, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; GFX7-NEXT: v_max_f32_e32 v2, 4.0, v2
-; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v1, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB29_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fmax_noret_f32__amdgpu_ignore_denormal_mode:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, 4.0
; GFX6-NEXT: s_mov_b32 m0, -1
-; GFX6-NEXT: ds_read_b32 v1, v0
-; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; GFX6-NEXT: v_max_f32_e32 v2, 4.0, v2
-; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX6-NEXT: ds_max_f32 v0, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v2
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT: s_cbranch_execnz .LBB29_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmax ptr addrspace(3) %ptr, float 4.0 seq_cst, !amdgpu.ignore.denormal.mode !0
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
index c4c70d9..4bc9404 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
@@ -21,212 +21,82 @@ define float @local_atomic_fmin_ret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v1, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v1, v2, v2
-; GFX12-NEXT: v_min_num_f32_e32 v1, 4.0, v1
+; GFX12-NEXT: v_mov_b32_e32 v1, 4.0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2
+; GFX12-NEXT: ds_min_num_rtn_f32 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB0_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_f32:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: ds_read_b32 v1, v0
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, v1
-; GFX940-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX940-NEXT: v_min_f32_e32 v1, 4.0, v1
-; GFX940-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX940-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX940-NEXT: ds_min_rtn_f32 v0, v0, v1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB0_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: local_atomic_fmin_ret_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v1, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX11-NEXT: v_min_f32_e32 v1, 4.0, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, 4.0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2
+; GFX11-NEXT: ds_min_rtn_f32 v0, v0, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB0_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmin_ret_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: ds_read_b32 v1, v0
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, v1
-; GFX10-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX10-NEXT: v_min_f32_e32 v1, 4.0, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, 4.0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX10-NEXT: ds_min_rtn_f32 v0, v0, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB0_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: local_atomic_fmin_ret_f32:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v1, v0
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX90A-NEXT: v_min_f32_e32 v1, 4.0, v1
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX90A-NEXT: ds_min_rtn_f32 v0, v0, v1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB0_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fmin_ret_f32:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: ds_read_b32 v1, v0
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, v1
-; GFX908-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX908-NEXT: v_min_f32_e32 v1, 4.0, v1
-; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX908-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX908-NEXT: ds_min_rtn_f32 v0, v0, v1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB0_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v1
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: local_atomic_fmin_ret_f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, 4.0
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_read_b32 v1, v0
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: ds_min_rtn_f32 v0, v0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, v1
-; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; GFX8-NEXT: v_min_f32_e32 v1, 4.0, v1
-; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB0_1
-; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: local_atomic_fmin_ret_f32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v1, 4.0
; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: ds_read_b32 v1, v0
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v2, v1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; GFX7-NEXT: v_min_f32_e32 v1, 4.0, v1
-; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX7-NEXT: ds_min_rtn_f32 v0, v0, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB0_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fmin_ret_f32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, 4.0
; GFX6-NEXT: s_mov_b32 m0, -1
-; GFX6-NEXT: ds_read_b32 v1, v0
-; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: ds_min_rtn_f32 v0, v0, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, v1
-; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; GFX6-NEXT: v_min_f32_e32 v1, 4.0, v1
-; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT: s_cbranch_execnz .LBB0_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmin ptr addrspace(3) %ptr, float 4.0 seq_cst
ret float %result
@@ -240,212 +110,83 @@ define float @local_atomic_fmin_ret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v1, v0 offset:65532
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v1, v2, v2
-; GFX12-NEXT: v_min_num_f32_e32 v1, 4.0, v1
+; GFX12-NEXT: v_mov_b32_e32 v1, 4.0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65532
+; GFX12-NEXT: ds_min_num_rtn_f32 v0, v0, v1 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB1_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_f32__offset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: ds_read_b32 v1, v0 offset:65532
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX940-NEXT: ds_min_rtn_f32 v0, v0, v1 offset:65532
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, v1
-; GFX940-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX940-NEXT: v_min_f32_e32 v1, 4.0, v1
-; GFX940-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB1_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: local_atomic_fmin_ret_f32__offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v1, v0 offset:65532
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX11-NEXT: v_min_f32_e32 v1, 4.0, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, 4.0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65532
+; GFX11-NEXT: ds_min_rtn_f32 v0, v0, v1 offset:65532
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB1_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmin_ret_f32__offset:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: ds_read_b32 v1, v0 offset:65532
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, v1
-; GFX10-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX10-NEXT: v_min_f32_e32 v1, 4.0, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, 4.0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532
+; GFX10-NEXT: ds_min_rtn_f32 v0, v0, v1 offset:65532
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB1_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: local_atomic_fmin_ret_f32__offset:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65532
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX90A-NEXT: ds_min_rtn_f32 v0, v0, v1 offset:65532
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX90A-NEXT: v_min_f32_e32 v1, 4.0, v1
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB1_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fmin_ret_f32__offset:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: ds_read_b32 v1, v0 offset:65532
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, v1
-; GFX908-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX908-NEXT: v_min_f32_e32 v1, 4.0, v1
-; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532
+; GFX908-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX908-NEXT: ds_min_rtn_f32 v0, v0, v1 offset:65532
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB1_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v1
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: local_atomic_fmin_ret_f32__offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, 4.0
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_read_b32 v1, v0 offset:65532
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: ds_min_rtn_f32 v0, v0, v1 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, v1
-; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; GFX8-NEXT: v_min_f32_e32 v1, 4.0, v1
-; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB1_1
-; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: local_atomic_fmin_ret_f32__offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v1, 4.0
; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: ds_read_b32 v1, v0 offset:65532
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v2, v1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; GFX7-NEXT: v_min_f32_e32 v1, 4.0, v1
-; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532
+; GFX7-NEXT: ds_min_rtn_f32 v0, v0, v1 offset:65532
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB1_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fmin_ret_f32__offset:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffc, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0
+; GFX6-NEXT: v_mov_b32_e32 v1, 4.0
; GFX6-NEXT: s_mov_b32 m0, -1
-; GFX6-NEXT: ds_read_b32 v0, v1
-; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: ds_min_rtn_f32 v0, v0, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, v0
-; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v2
-; GFX6-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX6-NEXT: ds_cmpst_rtn_b32 v0, v1, v2, v0
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
-; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT: s_cbranch_execnz .LBB1_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(3) %ptr, i32 16383
%result = atomicrmw fmin ptr addrspace(3) %gep, float 4.0 seq_cst
@@ -460,203 +201,82 @@ define void @local_atomic_fmin_noret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v1, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v1, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v2, 4.0, v2
+; GFX12-NEXT: v_mov_b32_e32 v1, 4.0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1
+; GFX12-NEXT: ds_min_num_f32 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-NEXT: v_mov_b32_e32 v1, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB2_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_f32:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: ds_read_b32 v1, v0
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX940-NEXT: v_min_f32_e32 v2, 4.0, v2
-; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX940-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX940-NEXT: ds_min_f32 v0, v1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v1, v2
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB2_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: local_atomic_fmin_noret_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v1, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v2, 4.0, v2
+; GFX11-NEXT: v_mov_b32_e32 v1, 4.0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1
+; GFX11-NEXT: ds_min_f32 v0, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB2_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmin_noret_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: ds_read_b32 v1, v0
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX10-NEXT: v_min_f32_e32 v2, 4.0, v2
+; GFX10-NEXT: v_mov_b32_e32 v1, 4.0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX10-NEXT: ds_min_f32 v0, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB2_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: local_atomic_fmin_noret_f32:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v1, v0
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX90A-NEXT: ds_min_f32 v0, v1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX90A-NEXT: v_min_f32_e32 v2, 4.0, v2
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB2_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fmin_noret_f32:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: ds_read_b32 v1, v0
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX908-NEXT: ds_min_f32 v0, v1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX908-NEXT: v_min_f32_e32 v2, 4.0, v2
-; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v1, v2
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB2_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: local_atomic_fmin_noret_f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, 4.0
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_read_b32 v1, v0
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; GFX8-NEXT: v_min_f32_e32 v2, 4.0, v2
-; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX8-NEXT: ds_min_f32 v0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v2
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB2_1
-; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: local_atomic_fmin_noret_f32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v1, 4.0
; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: ds_read_b32 v1, v0
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; GFX7-NEXT: v_min_f32_e32 v2, 4.0, v2
-; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX7-NEXT: ds_min_f32 v0, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v1, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB2_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fmin_noret_f32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, 4.0
; GFX6-NEXT: s_mov_b32 m0, -1
-; GFX6-NEXT: ds_read_b32 v1, v0
-; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: ds_min_f32 v0, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; GFX6-NEXT: v_min_f32_e32 v2, 4.0, v2
-; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v2
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT: s_cbranch_execnz .LBB2_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmin ptr addrspace(3) %ptr, float 4.0 seq_cst
ret void
@@ -670,204 +290,83 @@ define void @local_atomic_fmin_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v1, v0 offset:65532
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v1, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v2, 4.0, v2
+; GFX12-NEXT: v_mov_b32_e32 v1, 4.0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65532
+; GFX12-NEXT: ds_min_num_f32 v0, v1 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-NEXT: v_mov_b32_e32 v1, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB3_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_f32__offset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: ds_read_b32 v1, v0 offset:65532
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX940-NEXT: ds_min_f32 v0, v1 offset:65532
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX940-NEXT: v_min_f32_e32 v2, 4.0, v2
-; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v1, v2
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB3_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: local_atomic_fmin_noret_f32__offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v1, v0 offset:65532
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v2, 4.0, v2
+; GFX11-NEXT: v_mov_b32_e32 v1, 4.0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65532
+; GFX11-NEXT: ds_min_f32 v0, v1 offset:65532
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB3_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmin_noret_f32__offset:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: ds_read_b32 v1, v0 offset:65532
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX10-NEXT: v_min_f32_e32 v2, 4.0, v2
+; GFX10-NEXT: v_mov_b32_e32 v1, 4.0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532
+; GFX10-NEXT: ds_min_f32 v0, v1 offset:65532
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB3_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: local_atomic_fmin_noret_f32__offset:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65532
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX90A-NEXT: ds_min_f32 v0, v1 offset:65532
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX90A-NEXT: v_min_f32_e32 v2, 4.0, v2
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB3_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fmin_noret_f32__offset:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: ds_read_b32 v1, v0 offset:65532
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX908-NEXT: v_min_f32_e32 v2, 4.0, v2
-; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532
+; GFX908-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX908-NEXT: ds_min_f32 v0, v1 offset:65532
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v1, v2
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB3_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: local_atomic_fmin_noret_f32__offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, 4.0
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_read_b32 v1, v0 offset:65532
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: ds_min_f32 v0, v1 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; GFX8-NEXT: v_min_f32_e32 v2, 4.0, v2
-; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v2
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB3_1
-; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: local_atomic_fmin_noret_f32__offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v1, 4.0
; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: ds_read_b32 v1, v0 offset:65532
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: ds_min_f32 v0, v1 offset:65532
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; GFX7-NEXT: v_min_f32_e32 v2, 4.0, v2
-; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v1, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB3_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fmin_noret_f32__offset:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0
+; GFX6-NEXT: v_mov_b32_e32 v1, 4.0
; GFX6-NEXT: s_mov_b32 m0, -1
-; GFX6-NEXT: ds_read_b32 v1, v0
-; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; GFX6-NEXT: v_min_f32_e32 v2, 4.0, v2
-; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX6-NEXT: ds_min_f32 v0, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v2
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT: s_cbranch_execnz .LBB3_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(3) %ptr, i32 16383
%unused = atomicrmw fmin ptr addrspace(3) %gep, float 4.0 seq_cst
@@ -886,217 +385,90 @@ define double @local_atomic_fmin_ret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, v0
-; GFX12-NEXT: ds_load_b64 v[0:1], v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[3:4], v[3:4]
-; GFX12-NEXT: v_min_num_f64_e32 v[0:1], 4.0, v[0:1]
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[3:4]
+; GFX12-NEXT: ds_min_num_rtn_f64 v[0:1], v0, v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4]
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB4_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_f64:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, v0
-; GFX940-NEXT: ds_read_b64 v[0:1], v0
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
-; GFX940-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
-; GFX940-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX940-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[4:5], v[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0
+; GFX940-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3]
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB4_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: local_atomic_fmin_ret_f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, v0
-; GFX11-NEXT: ds_load_b64 v[0:1], v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4]
-; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[3:4]
+; GFX11-NEXT: ds_min_rtn_f64 v[0:1], v0, v[1:2]
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4]
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB4_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmin_ret_f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, v0
-; GFX10-NEXT: ds_read_b64 v[0:1], v0
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v1
-; GFX10-NEXT: v_mov_b32_e32 v3, v0
-; GFX10-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4]
-; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1]
+; GFX10-NEXT: ds_min_rtn_f64 v[0:1], v0, v[1:2]
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4]
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB4_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: local_atomic_fmin_ret_f64:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
-; GFX90A-NEXT: ds_read_b64 v[0:1], v0
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000
+; GFX90A-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
-; GFX90A-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX90A-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[4:5], v[0:1]
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB4_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fmin_ret_f64:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, v0
-; GFX908-NEXT: ds_read_b64 v[0:1], v0
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v1
-; GFX908-NEXT: v_mov_b32_e32 v3, v0
-; GFX908-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4]
-; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX908-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1]
+; GFX908-NEXT: v_mov_b32_e32 v1, 0
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x40100000
+; GFX908-NEXT: ds_min_rtn_f64 v[0:1], v0, v[1:2]
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4]
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB4_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: local_atomic_fmin_ret_f64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: ds_read_b64 v[0:1], v0
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, v0
-; GFX8-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4]
-; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX8-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1]
+; GFX8-NEXT: ds_min_rtn_f64 v[0:1], v0, v[1:2]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4]
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB4_1
-; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: local_atomic_fmin_ret_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: ds_read_b64 v[0:1], v0
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v1
-; GFX7-NEXT: v_mov_b32_e32 v3, v0
-; GFX7-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4]
-; GFX7-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX7-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1]
+; GFX7-NEXT: ds_min_rtn_f64 v[0:1], v0, v[1:2]
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4]
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB4_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fmin_ret_f64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, 0
+; GFX6-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX6-NEXT: s_mov_b32 m0, -1
-; GFX6-NEXT: v_mov_b32_e32 v2, v0
-; GFX6-NEXT: ds_read_b64 v[0:1], v0
-; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, v1
-; GFX6-NEXT: v_mov_b32_e32 v3, v0
-; GFX6-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4]
-; GFX6-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX6-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1]
+; GFX6-NEXT: ds_min_rtn_f64 v[0:1], v0, v[1:2]
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4]
-; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT: s_cbranch_execnz .LBB4_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmin ptr addrspace(3) %ptr, double 4.0 seq_cst
ret double %result
@@ -1110,217 +482,91 @@ define double @local_atomic_fmin_ret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, v0
-; GFX12-NEXT: ds_load_b64 v[0:1], v0 offset:65528
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[3:4], v[3:4]
-; GFX12-NEXT: v_min_num_f64_e32 v[0:1], 4.0, v[0:1]
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[3:4] offset:65528
+; GFX12-NEXT: ds_min_num_rtn_f64 v[0:1], v0, v[1:2] offset:65528
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4]
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB5_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_f64__offset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, v0
-; GFX940-NEXT: ds_read_b64 v[0:1], v0 offset:65528
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
-; GFX940-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
-; GFX940-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX940-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[4:5], v[0:1] offset:65528
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0
+; GFX940-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3] offset:65528
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB5_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: local_atomic_fmin_ret_f64__offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, v0
-; GFX11-NEXT: ds_load_b64 v[0:1], v0 offset:65528
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4]
-; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[3:4] offset:65528
+; GFX11-NEXT: ds_min_rtn_f64 v[0:1], v0, v[1:2] offset:65528
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4]
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB5_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmin_ret_f64__offset:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, v0
-; GFX10-NEXT: ds_read_b64 v[0:1], v0 offset:65528
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v1
-; GFX10-NEXT: v_mov_b32_e32 v3, v0
-; GFX10-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4]
-; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] offset:65528
+; GFX10-NEXT: ds_min_rtn_f64 v[0:1], v0, v[1:2] offset:65528
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4]
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB5_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: local_atomic_fmin_ret_f64__offset:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
-; GFX90A-NEXT: ds_read_b64 v[0:1], v0 offset:65528
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
-; GFX90A-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX90A-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[4:5], v[0:1] offset:65528
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000
+; GFX90A-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3] offset:65528
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB5_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fmin_ret_f64__offset:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, v0
-; GFX908-NEXT: ds_read_b64 v[0:1], v0 offset:65528
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v1
-; GFX908-NEXT: v_mov_b32_e32 v3, v0
-; GFX908-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4]
-; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX908-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] offset:65528
+; GFX908-NEXT: v_mov_b32_e32 v1, 0
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x40100000
+; GFX908-NEXT: ds_min_rtn_f64 v[0:1], v0, v[1:2] offset:65528
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4]
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB5_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: local_atomic_fmin_ret_f64__offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: ds_read_b64 v[0:1], v0 offset:65528
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, v0
-; GFX8-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4]
-; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX8-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] offset:65528
+; GFX8-NEXT: ds_min_rtn_f64 v[0:1], v0, v[1:2] offset:65528
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4]
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB5_1
-; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: local_atomic_fmin_ret_f64__offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: ds_read_b64 v[0:1], v0 offset:65528
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v1
-; GFX7-NEXT: v_mov_b32_e32 v3, v0
-; GFX7-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4]
-; GFX7-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX7-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] offset:65528
+; GFX7-NEXT: ds_min_rtn_f64 v[0:1], v0, v[1:2] offset:65528
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4]
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB5_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fmin_ret_f64__offset:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xfff8, v0
+; GFX6-NEXT: v_mov_b32_e32 v0, 0
+; GFX6-NEXT: v_mov_b32_e32 v1, 0x40100000
; GFX6-NEXT: s_mov_b32 m0, -1
-; GFX6-NEXT: ds_read_b64 v[0:1], v2
-; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: ds_min_rtn_f64 v[0:1], v2, v[0:1]
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, v1
-; GFX6-NEXT: v_mov_b32_e32 v3, v0
-; GFX6-NEXT: v_max_f64 v[0:1], v[3:4], v[3:4]
-; GFX6-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX6-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1]
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4]
-; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT: s_cbranch_execnz .LBB5_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr addrspace(3) %ptr, i32 8191
%result = atomicrmw fmin ptr addrspace(3) %gep, double 4.0 seq_cst
@@ -1335,208 +581,90 @@ define void @local_atomic_fmin_noret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b64 v[1:2], v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[3:4], v[1:2], v[1:2]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f64_e32 v[3:4], 4.0, v[3:4]
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2]
+; GFX12-NEXT: ds_min_num_f64 v0, v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
-; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB6_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_f64:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: ds_read_b64 v[2:3], v0
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0
+; GFX940-NEXT: ds_min_f64 v0, v[2:3]
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX940-NEXT: v_min_f64 v[4:5], v[4:5], 4.0
-; GFX940-NEXT: ds_cmpst_rtn_b64 v[4:5], v0, v[2:3], v[4:5]
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[4:5]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB6_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: local_atomic_fmin_noret_f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b64 v[1:2], v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2]
+; GFX11-NEXT: ds_min_f64 v0, v[1:2]
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
-; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB6_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmin_noret_f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: ds_read_b64 v[1:2], v0
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX10-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4]
+; GFX10-NEXT: ds_min_f64 v0, v[1:2]
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
-; GFX10-NEXT: v_mov_b32_e32 v1, v3
-; GFX10-NEXT: v_mov_b32_e32 v2, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB6_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: local_atomic_fmin_noret_f64:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b64 v[2:3], v0
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000
+; GFX90A-NEXT: ds_min_f64 v0, v[2:3]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], 4.0
-; GFX90A-NEXT: ds_cmpst_rtn_b64 v[4:5], v0, v[2:3], v[4:5]
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB6_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fmin_noret_f64:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: ds_read_b64 v[1:2], v0
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: v_mov_b32_e32 v1, 0
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x40100000
+; GFX908-NEXT: ds_min_f64 v0, v[1:2]
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX908-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
-; GFX908-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4]
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
-; GFX908-NEXT: v_mov_b32_e32 v1, v3
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v2, v4
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB6_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: local_atomic_fmin_noret_f64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_read_b64 v[1:2], v0
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: ds_min_f64 v0, v[1:2]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX8-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
-; GFX8-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4]
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v1, v3
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v2, v4
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB6_1
-; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: local_atomic_fmin_noret_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: ds_read_b64 v[1:2], v0
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: ds_min_f64 v0, v[1:2]
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX7-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
-; GFX7-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4]
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
-; GFX7-NEXT: v_mov_b32_e32 v1, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v2, v4
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB6_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fmin_noret_f64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, 0
+; GFX6-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX6-NEXT: s_mov_b32 m0, -1
-; GFX6-NEXT: ds_read_b64 v[1:2], v0
-; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX6-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
-; GFX6-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4]
+; GFX6-NEXT: ds_min_f64 v0, v[1:2]
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
-; GFX6-NEXT: v_mov_b32_e32 v1, v3
-; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v2, v4
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT: s_cbranch_execnz .LBB6_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmin ptr addrspace(3) %ptr, double 4.0 seq_cst
ret void
@@ -1550,209 +678,91 @@ define void @local_atomic_fmin_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b64 v[1:2], v0 offset:65528
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[3:4], v[1:2], v[1:2]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f64_e32 v[3:4], 4.0, v[3:4]
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] offset:65528
+; GFX12-NEXT: ds_min_num_f64 v0, v[1:2] offset:65528
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
-; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB7_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_f64__offset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: ds_read_b64 v[2:3], v0 offset:65528
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0
+; GFX940-NEXT: ds_min_f64 v0, v[2:3] offset:65528
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX940-NEXT: v_min_f64 v[4:5], v[4:5], 4.0
-; GFX940-NEXT: ds_cmpst_rtn_b64 v[4:5], v0, v[2:3], v[4:5] offset:65528
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[4:5]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB7_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: local_atomic_fmin_noret_f64__offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b64 v[1:2], v0 offset:65528
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] offset:65528
+; GFX11-NEXT: ds_min_f64 v0, v[1:2] offset:65528
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
-; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB7_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmin_noret_f64__offset:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: ds_read_b64 v[1:2], v0 offset:65528
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX10-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528
+; GFX10-NEXT: ds_min_f64 v0, v[1:2] offset:65528
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
-; GFX10-NEXT: v_mov_b32_e32 v1, v3
-; GFX10-NEXT: v_mov_b32_e32 v2, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB7_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: local_atomic_fmin_noret_f64__offset:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b64 v[2:3], v0 offset:65528
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000
+; GFX90A-NEXT: ds_min_f64 v0, v[2:3] offset:65528
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], 4.0
-; GFX90A-NEXT: ds_cmpst_rtn_b64 v[4:5], v0, v[2:3], v[4:5] offset:65528
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB7_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fmin_noret_f64__offset:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: ds_read_b64 v[1:2], v0 offset:65528
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: v_mov_b32_e32 v1, 0
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x40100000
+; GFX908-NEXT: ds_min_f64 v0, v[1:2] offset:65528
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX908-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
-; GFX908-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
-; GFX908-NEXT: v_mov_b32_e32 v1, v3
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v2, v4
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB7_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: local_atomic_fmin_noret_f64__offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_read_b64 v[1:2], v0 offset:65528
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX8-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
-; GFX8-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528
+; GFX8-NEXT: ds_min_f64 v0, v[1:2] offset:65528
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v1, v3
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v2, v4
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB7_1
-; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: local_atomic_fmin_noret_f64__offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: v_mov_b32_e32 v2, 0x40100000
; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: ds_read_b64 v[1:2], v0 offset:65528
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX7-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
-; GFX7-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528
+; GFX7-NEXT: ds_min_f64 v0, v[1:2] offset:65528
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
-; GFX7-NEXT: v_mov_b32_e32 v1, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v2, v4
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB7_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fmin_noret_f64__offset:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xfff8, v0
+; GFX6-NEXT: v_mov_b32_e32 v0, 0
+; GFX6-NEXT: v_mov_b32_e32 v1, 0x40100000
; GFX6-NEXT: s_mov_b32 m0, -1
-; GFX6-NEXT: ds_read_b64 v[0:1], v2
-; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_max_f64 v[3:4], v[0:1], v[0:1]
-; GFX6-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
-; GFX6-NEXT: ds_cmpst_rtn_b64 v[3:4], v2, v[0:1], v[3:4]
+; GFX6-NEXT: ds_min_f64 v2, v[0:1]
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[0:1]
-; GFX6-NEXT: v_mov_b32_e32 v0, v3
-; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v4
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT: s_cbranch_execnz .LBB7_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr addrspace(3) %ptr, i32 8191
%unused = atomicrmw fmin ptr addrspace(3) %gep, double 4.0 seq_cst
@@ -8032,212 +7042,82 @@ define float @local_atomic_fmin_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v1, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v1, v2, v2
-; GFX12-NEXT: v_min_num_f32_e32 v1, 4.0, v1
+; GFX12-NEXT: v_mov_b32_e32 v1, 4.0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2
+; GFX12-NEXT: ds_min_num_rtn_f32 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB28_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_f32__amdgpu_ignore_denormal_mode:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: ds_read_b32 v1, v0
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, v1
-; GFX940-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX940-NEXT: v_min_f32_e32 v1, 4.0, v1
-; GFX940-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX940-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX940-NEXT: ds_min_rtn_f32 v0, v0, v1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB28_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: local_atomic_fmin_ret_f32__amdgpu_ignore_denormal_mode:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v1, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX11-NEXT: v_min_f32_e32 v1, 4.0, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, 4.0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2
+; GFX11-NEXT: ds_min_rtn_f32 v0, v0, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB28_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmin_ret_f32__amdgpu_ignore_denormal_mode:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: ds_read_b32 v1, v0
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, v1
-; GFX10-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX10-NEXT: v_min_f32_e32 v1, 4.0, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, 4.0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX10-NEXT: ds_min_rtn_f32 v0, v0, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB28_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: local_atomic_fmin_ret_f32__amdgpu_ignore_denormal_mode:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v1, v0
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX90A-NEXT: ds_min_rtn_f32 v0, v0, v1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX90A-NEXT: v_min_f32_e32 v1, 4.0, v1
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB28_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fmin_ret_f32__amdgpu_ignore_denormal_mode:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: ds_read_b32 v1, v0
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, v1
-; GFX908-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX908-NEXT: v_min_f32_e32 v1, 4.0, v1
-; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX908-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX908-NEXT: ds_min_rtn_f32 v0, v0, v1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB28_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v1
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: local_atomic_fmin_ret_f32__amdgpu_ignore_denormal_mode:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, 4.0
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_read_b32 v1, v0
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, v1
-; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; GFX8-NEXT: v_min_f32_e32 v1, 4.0, v1
-; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX8-NEXT: ds_min_rtn_f32 v0, v0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB28_1
-; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: local_atomic_fmin_ret_f32__amdgpu_ignore_denormal_mode:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v1, 4.0
; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: ds_read_b32 v1, v0
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: ds_min_rtn_f32 v0, v0, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v2, v1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; GFX7-NEXT: v_min_f32_e32 v1, 4.0, v1
-; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB28_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fmin_ret_f32__amdgpu_ignore_denormal_mode:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, 4.0
; GFX6-NEXT: s_mov_b32 m0, -1
-; GFX6-NEXT: ds_read_b32 v1, v0
-; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, v1
-; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; GFX6-NEXT: v_min_f32_e32 v1, 4.0, v1
-; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX6-NEXT: ds_min_rtn_f32 v0, v0, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT: s_cbranch_execnz .LBB28_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmin ptr addrspace(3) %ptr, float 4.0 seq_cst, !amdgpu.ignore.denormal.mode !0
ret float %result
@@ -8251,203 +7131,82 @@ define void @local_atomic_fmin_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v1, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v1, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v2, 4.0, v2
+; GFX12-NEXT: v_mov_b32_e32 v1, 4.0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1
+; GFX12-NEXT: ds_min_num_f32 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-NEXT: v_mov_b32_e32 v1, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB29_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_f32__amdgpu_ignore_denormal_mode:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: ds_read_b32 v1, v0
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX940-NEXT: v_min_f32_e32 v2, 4.0, v2
-; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX940-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX940-NEXT: ds_min_f32 v0, v1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v1, v2
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB29_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: local_atomic_fmin_noret_f32__amdgpu_ignore_denormal_mode:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v1, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v2, 4.0, v2
+; GFX11-NEXT: v_mov_b32_e32 v1, 4.0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1
+; GFX11-NEXT: ds_min_f32 v0, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB29_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmin_noret_f32__amdgpu_ignore_denormal_mode:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: ds_read_b32 v1, v0
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX10-NEXT: v_min_f32_e32 v2, 4.0, v2
+; GFX10-NEXT: v_mov_b32_e32 v1, 4.0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX10-NEXT: ds_min_f32 v0, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB29_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: local_atomic_fmin_noret_f32__amdgpu_ignore_denormal_mode:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v1, v0
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX90A-NEXT: ds_min_f32 v0, v1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX90A-NEXT: v_min_f32_e32 v2, 4.0, v2
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB29_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fmin_noret_f32__amdgpu_ignore_denormal_mode:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: ds_read_b32 v1, v0
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX908-NEXT: ds_min_f32 v0, v1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX908-NEXT: v_min_f32_e32 v2, 4.0, v2
-; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v1, v2
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB29_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: local_atomic_fmin_noret_f32__amdgpu_ignore_denormal_mode:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, 4.0
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_read_b32 v1, v0
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: ds_min_f32 v0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; GFX8-NEXT: v_min_f32_e32 v2, 4.0, v2
-; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v2
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB29_1
-; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: local_atomic_fmin_noret_f32__amdgpu_ignore_denormal_mode:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v1, 4.0
; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: ds_read_b32 v1, v0
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: ds_min_f32 v0, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; GFX7-NEXT: v_min_f32_e32 v2, 4.0, v2
-; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v1, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB29_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fmin_noret_f32__amdgpu_ignore_denormal_mode:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, 4.0
; GFX6-NEXT: s_mov_b32 m0, -1
-; GFX6-NEXT: ds_read_b32 v1, v0
-; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; GFX6-NEXT: v_min_f32_e32 v2, 4.0, v2
-; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX6-NEXT: ds_min_f32 v0, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v2
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT: s_cbranch_execnz .LBB29_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmin ptr addrspace(3) %ptr, float 4.0 seq_cst, !amdgpu.ignore.denormal.mode !0
ret void
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll
index 5a79bc2..387bec7 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll
@@ -46,20 +46,8 @@ define float @test_atomicrmw_fmax_f32_global(ptr addrspace(1) %ptr, float %value
define float @test_atomicrmw_fmax_f32_local(ptr addrspace(3) %ptr, float %value) {
; GCN-LABEL: @test_atomicrmw_fmax_f32_local(
-; GCN-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(3) [[PTR:%.*]], align 4
-; GCN-NEXT: br label [[ATOMICRMW_START:%.*]]
-; GCN: atomicrmw.start:
-; GCN-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; GCN-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE:%.*]])
-; GCN-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; GCN-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GCN-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
-; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; GCN-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; GCN-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GCN: atomicrmw.end:
-; GCN-NEXT: ret float [[TMP6]]
+; GCN-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] seq_cst, align 4
+; GCN-NEXT: ret float [[RES]]
;
%res = atomicrmw fmax ptr addrspace(3) %ptr, float %value seq_cst
ret float %res
@@ -242,20 +230,8 @@ define double @test_atomicrmw_fmax_f64_global(ptr addrspace(1) %ptr, double %val
define double @test_atomicrmw_fmax_f64_local(ptr addrspace(3) %ptr, double %value) {
; GCN-LABEL: @test_atomicrmw_fmax_f64_local(
-; GCN-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8
-; GCN-NEXT: br label [[ATOMICRMW_START:%.*]]
-; GCN: atomicrmw.start:
-; GCN-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; GCN-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE:%.*]])
-; GCN-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; GCN-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GCN-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
-; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; GCN-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; GCN-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GCN: atomicrmw.end:
-; GCN-NEXT: ret double [[TMP6]]
+; GCN-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(3) [[PTR:%.*]], double [[VALUE:%.*]] seq_cst, align 8
+; GCN-NEXT: ret double [[RES]]
;
%res = atomicrmw fmax ptr addrspace(3) %ptr, double %value seq_cst
ret double %res
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll
index e3d3bfd..e7c8faa 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll
@@ -46,20 +46,8 @@ define float @test_atomicrmw_fmin_f32_global(ptr addrspace(1) %ptr, float %value
define float @test_atomicrmw_fmin_f32_local(ptr addrspace(3) %ptr, float %value) {
; GCN-LABEL: @test_atomicrmw_fmin_f32_local(
-; GCN-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(3) [[PTR:%.*]], align 4
-; GCN-NEXT: br label [[ATOMICRMW_START:%.*]]
-; GCN: atomicrmw.start:
-; GCN-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; GCN-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE:%.*]])
-; GCN-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; GCN-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GCN-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
-; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; GCN-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; GCN-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GCN: atomicrmw.end:
-; GCN-NEXT: ret float [[TMP6]]
+; GCN-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] seq_cst, align 4
+; GCN-NEXT: ret float [[RES]]
;
%res = atomicrmw fmin ptr addrspace(3) %ptr, float %value seq_cst
ret float %res
@@ -242,20 +230,8 @@ define double @test_atomicrmw_fmin_f64_global(ptr addrspace(1) %ptr, double %val
define double @test_atomicrmw_fmin_f64_local(ptr addrspace(3) %ptr, double %value) {
; GCN-LABEL: @test_atomicrmw_fmin_f64_local(
-; GCN-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8
-; GCN-NEXT: br label [[ATOMICRMW_START:%.*]]
-; GCN: atomicrmw.start:
-; GCN-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; GCN-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE:%.*]])
-; GCN-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; GCN-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GCN-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
-; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; GCN-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; GCN-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GCN: atomicrmw.end:
-; GCN-NEXT: ret double [[TMP6]]
+; GCN-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(3) [[PTR:%.*]], double [[VALUE:%.*]] seq_cst, align 8
+; GCN-NEXT: ret double [[RES]]
;
%res = atomicrmw fmin ptr addrspace(3) %ptr, double %value seq_cst
ret double %res