aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatt Arsenault <Matthew.Arsenault@amd.com>2024-06-23 10:10:41 +0200
committerGitHub <noreply@github.com>2024-06-23 10:10:41 +0200
commita440a96ec2084985bca71e2b90b33bd07af3e65e (patch)
tree7ecfc29b69e23b28a3a9a8dbeb10f2222cb978da
parent3f33d2f3ca570f1e4e016a07f049724fdff6dad9 (diff)
downloadllvm-a440a96ec2084985bca71e2b90b33bd07af3e65e.zip
llvm-a440a96ec2084985bca71e2b90b33bd07af3e65e.tar.gz
llvm-a440a96ec2084985bca71e2b90b33bd07af3e65e.tar.bz2
AMDGPU: Start selecting flat/global atomicrmw fmin/fmax. (#95592)
Define subtarget features for atomic fmin/fmax support. The flat/global support is a real mess. We had float/double support at the beginning in gfx6 and gfx7. gfx8 removed these. gfx10 reintroduced them. gfx11 removed the f64 versions again. gfx9 partially reintroduced them in gfx90a and gfx940, but only for f64.
-rw-r--r--clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu16
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPU.td72
-rw-r--r--llvm/lib/Target/AMDGPU/BUFInstructions.td26
-rw-r--r--llvm/lib/Target/AMDGPU/FLATInstructions.td76
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSubtarget.h20
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.cpp29
-rw-r--r--llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll1477
-rw-r--r--llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll1477
-rw-r--r--llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll1912
-rw-r--r--llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll1912
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll605
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll605
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll638
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64.ll271
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll638
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64.ll271
-rw-r--r--llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll48
-rw-r--r--llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll48
-rw-r--r--llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll48
-rw-r--r--llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll48
-rw-r--r--llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll7
-rw-r--r--llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll7
22 files changed, 2948 insertions, 7303 deletions
diff --git a/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu b/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu
index 57557bf..eeb23bc 100644
--- a/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu
@@ -49,18 +49,18 @@ __global__ void ffp2(double *p) {
// CHECK: atomicrmw fmin ptr {{.*}} monotonic
// CHECK: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic
// CHECK: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic
- // SAFE: _Z4ffp2Pd
+ // SAFE-LABEL: @_Z4ffp2Pd
// SAFE: global_atomic_cmpswap_b64
// SAFE: global_atomic_cmpswap_b64
// SAFE: global_atomic_cmpswap_b64
// SAFE: global_atomic_cmpswap_b64
// SAFE: global_atomic_cmpswap_b64
- // UNSAFE: _Z4ffp2Pd
- // UNSAFE: global_atomic_cmpswap_x2
- // UNSAFE: global_atomic_cmpswap_x2
+ // UNSAFE-LABEL: @_Z4ffp2Pd
// UNSAFE: global_atomic_cmpswap_x2
// UNSAFE: global_atomic_cmpswap_x2
// UNSAFE: global_atomic_cmpswap_x2
+ // UNSAFE: global_atomic_max_f64
+ // UNSAFE: global_atomic_min_f64
__atomic_fetch_sub(p, 1.0, memory_order_relaxed);
__atomic_fetch_max(p, 1.0, memory_order_relaxed);
__atomic_fetch_min(p, 1.0, memory_order_relaxed);
@@ -76,18 +76,18 @@ __global__ void ffp3(long double *p) {
// CHECK: atomicrmw fmin ptr {{.*}} monotonic
// CHECK: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic
// CHECK: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic
- // SAFE: _Z4ffp3Pe
+ // SAFE-LABEL: @_Z4ffp3Pe
// SAFE: global_atomic_cmpswap_b64
// SAFE: global_atomic_cmpswap_b64
// SAFE: global_atomic_cmpswap_b64
// SAFE: global_atomic_cmpswap_b64
// SAFE: global_atomic_cmpswap_b64
- // UNSAFE: _Z4ffp3Pe
- // UNSAFE: global_atomic_cmpswap_x2
- // UNSAFE: global_atomic_cmpswap_x2
+ // UNSAFE-LABEL: @_Z4ffp3Pe
// UNSAFE: global_atomic_cmpswap_x2
// UNSAFE: global_atomic_cmpswap_x2
// UNSAFE: global_atomic_cmpswap_x2
+ // UNSAFE: global_atomic_max_f64
+ // UNSAFE: global_atomic_min_f64
__atomic_fetch_sub(p, 1.0L, memory_order_relaxed);
__atomic_fetch_max(p, 1.0L, memory_order_relaxed);
__atomic_fetch_min(p, 1.0L, memory_order_relaxed);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index cb5ceb9..d6a2395 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -351,6 +351,7 @@ def FeatureGFX90AInsts : SubtargetFeature<"gfx90a-insts",
"GFX90AInsts",
"true",
"Additional instructions for GFX90A+"
+ // [HasAtomicFMinFMaxF64GlobalInsts, HasAtomicFMinFMaxF64FlatInsts] // TODO
>;
def FeatureGFX940Insts : SubtargetFeature<"gfx940-insts",
@@ -711,6 +712,30 @@ def FeatureAtomicFaddRtnInsts : SubtargetFeature<"atomic-fadd-rtn-insts",
[FeatureFlatGlobalInsts]
>;
+def FeatureAtomicFMinFMaxF32GlobalInsts : SubtargetFeature<"atomic-fmin-fmax-global-f32",
+ "HasAtomicFMinFMaxF32GlobalInsts",
+ "true",
+ "Has global/buffer instructions for atomicrmw fmin/fmax for float"
+>;
+
+def FeatureAtomicFMinFMaxF64GlobalInsts : SubtargetFeature<"atomic-fmin-fmax-global-f64",
+ "HasAtomicFMinFMaxF64GlobalInsts",
+ "true",
+ "Has global/buffer instructions for atomicrmw fmin/fmax for float"
+>;
+
+def FeatureAtomicFMinFMaxF32FlatInsts : SubtargetFeature<"atomic-fmin-fmax-flat-f32",
+ "HasAtomicFMinFMaxF32FlatInsts",
+ "true",
+ "Has flat memory instructions for atomicrmw fmin/fmax for float"
+>;
+
+def FeatureAtomicFMinFMaxF64FlatInsts : SubtargetFeature<"atomic-fmin-fmax-flat-f64",
+ "HasAtomicFMinFMaxF64FlatInsts",
+ "true",
+ "Has flat memory instructions for atomicrmw fmin/fmax for double"
+>;
+
def FeatureAtomicFaddNoRtnInsts : SubtargetFeature<"atomic-fadd-no-rtn-insts",
"HasAtomicFaddNoRtnInsts",
"true",
@@ -1061,7 +1086,8 @@ def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
FeatureWavefrontSize64, FeatureSMemTimeInst, FeatureMadMacF32Insts,
FeatureDsSrc2Insts, FeatureLDSBankCount32, FeatureMovrel,
FeatureTrigReducedRange, FeatureExtendedImageInsts, FeatureImageInsts,
- FeatureGDS, FeatureGWS, FeatureDefaultComponentZero
+ FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
+ FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts
]
>;
@@ -1072,7 +1098,9 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS",
FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange,
FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts,
FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureUnalignedBufferAccess,
- FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero
+ FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
+ FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts,
+ FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts
]
>;
@@ -1127,7 +1155,9 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureG16,
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureImageInsts,
FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
- FeatureMaxHardClauseLength63
+ FeatureMaxHardClauseLength63,
+ FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts,
+ FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts
]
>;
@@ -1148,7 +1178,8 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11",
FeatureA16, FeatureFastDenormalF32, FeatureG16,
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureGDS,
FeatureGWS, FeatureDefaultComponentZero,
- FeatureMaxHardClauseLength32
+ FeatureMaxHardClauseLength32,
+ FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts
]
>;
@@ -1168,7 +1199,9 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12",
FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
FeatureA16, FeatureFastDenormalF32, FeatureG16,
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
- FeatureTrue16BitInsts, FeatureDefaultComponentBroadcast
+ FeatureTrue16BitInsts, FeatureDefaultComponentBroadcast,
+ FeatureMaxHardClauseLength32,
+ FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts
]
>;
@@ -1331,7 +1364,10 @@ def FeatureISAVersion9_0_A : FeatureSet<
FeaturePackedTID,
FullRate64Ops,
FeatureBackOffBarrier,
- FeatureKernargPreload])>;
+ FeatureKernargPreload,
+ FeatureAtomicFMinFMaxF64GlobalInsts,
+ FeatureAtomicFMinFMaxF64FlatInsts
+ ])>;
def FeatureISAVersion9_0_C : FeatureSet<
!listconcat(FeatureISAVersion9_0_Consumer_Common.Features,
@@ -1371,7 +1407,10 @@ def FeatureISAVersion9_4_Common : FeatureSet<
FeatureArchitectedFlatScratch,
FullRate64Ops,
FeatureBackOffBarrier,
- FeatureKernargPreload]>;
+ FeatureKernargPreload,
+ FeatureAtomicFMinFMaxF64GlobalInsts,
+ FeatureAtomicFMinFMaxF64FlatInsts
+ ]>;
def FeatureISAVersion9_4_0 : FeatureSet<
!listconcat(FeatureISAVersion9_4_Common.Features,
@@ -1862,11 +1901,28 @@ def isGFX12Plus :
def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">,
AssemblerPredicate<(all_of FeatureFlatAddressSpace)>;
-def HasBufferFlatGlobalAtomicsF64 :
+
+def HasBufferFlatGlobalAtomicsF64 : // FIXME: Rename to show it's only for fadd
Predicate<"Subtarget->hasBufferFlatGlobalAtomicsF64()">,
// FIXME: This is too coarse, and working around using pseudo's predicates on real instruction.
AssemblerPredicate<(any_of FeatureGFX90AInsts, FeatureGFX10Insts, FeatureSouthernIslands, FeatureSeaIslands)>;
+def HasAtomicFMinFMaxF32GlobalInsts :
+ Predicate<"Subtarget->hasAtomicFMinFMaxF32GlobalInsts()">,
+ AssemblerPredicate<(any_of FeatureAtomicFMinFMaxF32GlobalInsts)>;
+
+def HasAtomicFMinFMaxF64GlobalInsts :
+ Predicate<"Subtarget->hasAtomicFMinFMaxF64GlobalInsts()">,
+ AssemblerPredicate<(any_of FeatureAtomicFMinFMaxF64GlobalInsts)>;
+
+def HasAtomicFMinFMaxF32FlatInsts :
+ Predicate<"Subtarget->hasAtomicFMinFMaxF32FlatInsts()">,
+ AssemblerPredicate<(any_of FeatureAtomicFMinFMaxF32FlatInsts)>;
+
+def HasAtomicFMinFMaxF64FlatInsts :
+ Predicate<"Subtarget->hasAtomicFMinFMaxF64FlatInsts()">,
+ AssemblerPredicate<(any_of FeatureAtomicFMinFMaxF64FlatInsts)>;
+
def HasLdsAtomicAddF64 :
Predicate<"Subtarget->hasLdsAtomicAddF64()">,
AssemblerPredicate<(any_of FeatureGFX90AInsts)>;
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index f95c939..cda4b57 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1149,21 +1149,21 @@ let SubtargetPredicate = isGFX6GFX7GFX10Plus in {
defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Pseudo_Atomics <
"buffer_atomic_fcmpswap", VReg_64, v2f32, null_frag
>;
+}
+
+let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts in {
defm BUFFER_ATOMIC_FMIN : MUBUF_Pseudo_Atomics <
"buffer_atomic_fmin", VGPR_32, f32, null_frag
>;
defm BUFFER_ATOMIC_FMAX : MUBUF_Pseudo_Atomics <
"buffer_atomic_fmax", VGPR_32, f32, null_frag
>;
-
}
let SubtargetPredicate = isGFX6GFX7GFX10 in {
-
defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Pseudo_Atomics <
"buffer_atomic_fcmpswap_x2", VReg_128, v2f64, null_frag
>;
-
}
let SubtargetPredicate = HasD16LoadStore in {
@@ -1645,6 +1645,16 @@ defm : BufferAtomicPat<"atomic_load_udec_wrap_global", Ty, "BUFFER_ATOMIC_DEC" #
} // end foreach Ty
+let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts in {
+defm : BufferAtomicPat<"atomic_load_fmin_global", f32, "BUFFER_ATOMIC_FMIN">;
+defm : BufferAtomicPat<"atomic_load_fmax_global", f32, "BUFFER_ATOMIC_FMAX">;
+}
+
+let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in {
+defm : BufferAtomicPat<"atomic_load_fmin_global", f64, "BUFFER_ATOMIC_MIN_F64">;
+defm : BufferAtomicPat<"atomic_load_fmax_global", f64, "BUFFER_ATOMIC_MAX_F64">;
+}
+
defm : BufferAtomicCmpSwapPat<i32, v2i32, "BUFFER_ATOMIC_CMPSWAP">;
defm : BufferAtomicCmpSwapPat<i64, v2i64, "BUFFER_ATOMIC_CMPSWAP_X2">;
@@ -1746,11 +1756,12 @@ let SubtargetPredicate = HasAtomicCSubNoRtnInsts in {
defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_cond_sub_u32", i32, "BUFFER_ATOMIC_COND_SUB_U32_VBUFFER", ["noret"]>;
}
-let SubtargetPredicate = isGFX6GFX7GFX10Plus in {
+let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts in {
defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f32, "BUFFER_ATOMIC_FMIN">;
defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f32, "BUFFER_ATOMIC_FMAX">;
}
-let SubtargetPredicate = isGFX6GFX7GFX10 in {
+
+let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in {
defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, "BUFFER_ATOMIC_MIN_F64">;
defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f64, "BUFFER_ATOMIC_MAX_F64">;
}
@@ -1822,9 +1833,12 @@ let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts in {
let SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 in {
defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f64, "BUFFER_ATOMIC_ADD_F64">;
+} // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64
+
+let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in {
defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, "BUFFER_ATOMIC_MIN_F64">;
defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f64, "BUFFER_ATOMIC_MAX_F64">;
-} // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64
+} //End let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts
multiclass SIBufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, string Inst> {
foreach RtnMode = ["ret", "noret"] in {
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 8ecbd62..98054dd 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -752,19 +752,29 @@ defm FLAT_ATOMIC_DEC_X2 : FLAT_Atomic_Pseudo <"flat_atomic_dec_x2",
// GFX7-, GFX10-only flat instructions.
let SubtargetPredicate = isGFX7GFX10 in {
-
defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fcmpswap_x2",
VReg_64, f64, v2f64, VReg_128>;
-
} // End SubtargetPredicate = isGFX7GFX10
+
+// The names may be flat_atomic_fmin_x2 on some subtargets, but we
+// choose this as the canonical name.
+let SubtargetPredicate = HasAtomicFMinFMaxF64FlatInsts in {
+defm FLAT_ATOMIC_MIN_F64 : FLAT_Atomic_Pseudo <"flat_atomic_min_f64",
+ VReg_64, f64>;
+
+defm FLAT_ATOMIC_MAX_F64 : FLAT_Atomic_Pseudo <"flat_atomic_max_f64",
+ VReg_64, f64>;
+}
+
+let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in {
+defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_min_f64", VReg_64, f64>;
+defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", VReg_64, f64>;
+}
+
let SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 in {
defm FLAT_ATOMIC_ADD_F64 : FLAT_Atomic_Pseudo<"flat_atomic_add_f64", VReg_64, f64>;
- defm FLAT_ATOMIC_MIN_F64 : FLAT_Atomic_Pseudo<"flat_atomic_min_f64", VReg_64, f64>;
- defm FLAT_ATOMIC_MAX_F64 : FLAT_Atomic_Pseudo<"flat_atomic_max_f64", VReg_64, f64>;
defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_add_f64", VReg_64, f64>;
- defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_min_f64", VReg_64, f64>;
- defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", VReg_64, f64>;
} // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64
let SubtargetPredicate = HasAtomicFlatPkAdd16Insts in {
@@ -1415,6 +1425,17 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_OR_X2", "atomic_load_or_"#as, i64>;
defm : FlatAtomicPat <"FLAT_ATOMIC_SWAP_X2", "atomic_swap_"#as, i64>;
defm : FlatAtomicPat <"FLAT_ATOMIC_CMPSWAP_X2", "AMDGPUatomic_cmp_swap_"#as, i64, v2i64>;
defm : FlatAtomicPat <"FLAT_ATOMIC_XOR_X2", "atomic_load_xor_"#as, i64>;
+
+let SubtargetPredicate = HasAtomicFMinFMaxF32FlatInsts in {
+defm : FlatAtomicPat <"FLAT_ATOMIC_FMIN", "atomic_load_fmin_"#as, f32>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_FMAX", "atomic_load_fmax_"#as, f32>;
+}
+
+let SubtargetPredicate = HasAtomicFMinFMaxF64FlatInsts in {
+defm : FlatAtomicPat <"FLAT_ATOMIC_MIN_F64", "atomic_load_fmin_"#as, f64>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_"#as, f64>;
+}
+
} // end foreach as
let SubtargetPredicate = isGFX12Plus in {
@@ -1576,33 +1597,22 @@ let OtherPredicates = [isGFX12Plus] in {
}
}
-let OtherPredicates = [isGFX10Plus] in {
+let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in {
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>;
-defm : FlatAtomicPat <"FLAT_ATOMIC_FMIN", "atomic_load_fmin_flat", f32>;
-defm : FlatAtomicPat <"FLAT_ATOMIC_FMAX", "atomic_load_fmax_flat", f32>;
-}
-
-let OtherPredicates = [isGFX10GFX11] in {
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin", f32>;
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax", f32>;
+}
+let SubtargetPredicate = HasAtomicFMinFMaxF32FlatInsts in {
+defm : FlatAtomicPat <"FLAT_ATOMIC_FMIN", "atomic_load_fmin_flat", f32>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_FMAX", "atomic_load_fmax_flat", f32>;
defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin", f32>;
defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax", f32>;
}
-let OtherPredicates = [isGFX10Only] in {
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MIN_F64", "atomic_load_fmin_global", f64>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MAX_F64", "atomic_load_fmax_global", f64>;
-defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MIN_F64", "int_amdgcn_global_atomic_fmin", f64>;
-defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MAX_F64", "int_amdgcn_global_atomic_fmax", f64>;
-defm : FlatAtomicPat <"FLAT_ATOMIC_MIN_F64", "atomic_load_fmin_flat", f64>;
-defm : FlatAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_flat", f64>;
-defm : FlatAtomicIntrPat <"FLAT_ATOMIC_MIN_F64", "int_amdgcn_flat_atomic_fmin", f64>;
-defm : FlatAtomicIntrPat <"FLAT_ATOMIC_MAX_F64", "int_amdgcn_flat_atomic_fmax", f64>;
-}
-
let OtherPredicates = [isGFX12Only] in {
+ // FIXME: Remove these intrinsics
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin_num", f32>;
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax_num", f32>;
defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin_num", f32>;
@@ -1632,22 +1642,26 @@ defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_am
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>;
}
-let OtherPredicates = [HasBufferFlatGlobalAtomicsF64] in {
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", "atomic_load_fadd_global", f64>;
+let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in {
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MIN_F64", "atomic_load_fmin_global", f64>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MAX_F64", "atomic_load_fmax_global", f64>;
-defm : GlobalFLATAtomicPatsWithAddrSpace<"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", "global_addrspace", f64>;
-defm : GlobalFLATAtomicPatsWithAddrSpace<"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_global_atomic_fadd", "global_addrspace", f64>;
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MIN_F64", "int_amdgcn_global_atomic_fmin", f64>;
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MAX_F64", "int_amdgcn_global_atomic_fmax", f64>;
-defm : FlatAtomicPat <"FLAT_ATOMIC_ADD_F64", "atomic_load_fadd_flat", f64>;
-defm : FlatAtomicPat <"FLAT_ATOMIC_MIN_F64", "atomic_load_fmin_flat", f64>;
-defm : FlatAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_flat", f64>;
-defm : FlatAtomicIntrPat <"FLAT_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", f64>;
+}
+
+let SubtargetPredicate = HasAtomicFMinFMaxF64FlatInsts in {
defm : FlatAtomicIntrPat <"FLAT_ATOMIC_MIN_F64", "int_amdgcn_flat_atomic_fmin", f64>;
defm : FlatAtomicIntrPat <"FLAT_ATOMIC_MAX_F64", "int_amdgcn_flat_atomic_fmax", f64>;
}
+let OtherPredicates = [HasBufferFlatGlobalAtomicsF64] in {
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", "atomic_load_fadd_global", f64>;
+defm : GlobalFLATAtomicPatsWithAddrSpace<"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", "global_addrspace", f64>;
+defm : GlobalFLATAtomicPatsWithAddrSpace<"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_global_atomic_fadd", "global_addrspace", f64>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_ADD_F64", "atomic_load_fadd_flat", f64>;
+defm : FlatAtomicIntrPat <"FLAT_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", f64>;
+}
+
let OtherPredicates = [HasFlatAtomicFaddF32Inst] in {
defm : FlatAtomicPat <"FLAT_ATOMIC_ADD_F32", "atomic_load_fadd_flat", f32>;
defm : FlatAtomicIntrPat <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", f32>;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index db5b467..74c8f85 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -159,6 +159,10 @@ protected:
bool HasFP8Insts = false;
bool HasFP8ConversionInsts = false;
bool HasPkFmacF16Inst = false;
+ bool HasAtomicFMinFMaxF32GlobalInsts = false;
+ bool HasAtomicFMinFMaxF64GlobalInsts = false;
+ bool HasAtomicFMinFMaxF32FlatInsts = false;
+ bool HasAtomicFMinFMaxF64FlatInsts = false;
bool HasAtomicDsPkAdd16Insts = false;
bool HasAtomicFlatPkAdd16Insts = false;
bool HasAtomicFaddRtnInsts = false;
@@ -820,6 +824,22 @@ public:
return HasPkFmacF16Inst;
}
+ bool hasAtomicFMinFMaxF32GlobalInsts() const {
+ return HasAtomicFMinFMaxF32GlobalInsts;
+ }
+
+ bool hasAtomicFMinFMaxF64GlobalInsts() const {
+ return HasAtomicFMinFMaxF64GlobalInsts;
+ }
+
+ bool hasAtomicFMinFMaxF32FlatInsts() const {
+ return HasAtomicFMinFMaxF32FlatInsts;
+ }
+
+ bool hasAtomicFMinFMaxF64FlatInsts() const {
+ return HasAtomicFMinFMaxF64FlatInsts;
+ }
+
bool hasAtomicDsPkAdd16Insts() const { return HasAtomicDsPkAdd16Insts; }
bool hasAtomicFlatPkAdd16Insts() const { return HasAtomicFlatPkAdd16Insts; }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 5921422..d40fbe6 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -16082,6 +16082,35 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
if (AS == AMDGPUAS::LOCAL_ADDRESS && (Ty->isFloatTy() || Ty->isDoubleTy()))
return AtomicExpansionKind::None;
+ if (unsafeFPAtomicsDisabled(RMW->getFunction()))
+ return AtomicExpansionKind::CmpXChg;
+
+ // Always expand system scope fp atomics.
+ if (HasSystemScope)
+ return AtomicExpansionKind::CmpXChg;
+
+ // For flat and global cases:
+ // float, double in gfx7. Manual claims denormal support.
+ // Removed in gfx8.
+ // float, double restored in gfx10.
+ // double removed again in gfx11, so only f32 for gfx11/gfx12.
+ //
+ // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but no
+ // f32.
+ //
+ // FIXME: Check scope and fine grained memory
+ if (AS == AMDGPUAS::FLAT_ADDRESS) {
+ if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
+ if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
+ } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
+ if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
+ if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
+ }
+
return AtomicExpansionKind::CmpXChg;
}
case AtomicRMWInst::Min:
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
index 00d418c..a05c05f 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
@@ -20,28 +20,10 @@ define float @flat_agent_atomic_fmax_ret_f32(ptr %ptr, float %val) #0 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB0_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32:
@@ -72,55 +54,21 @@ define float @flat_agent_atomic_fmax_ret_f32(ptr %ptr, float %val) #0 {
; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_max_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: flat_atomic_max_f32 v0, v[0:1], v2 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB0_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_ret_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT: v_max_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_fmax v0, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB0_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f32:
@@ -198,25 +146,9 @@ define float @flat_agent_atomic_fmax_ret_f32(ptr %ptr, float %val) #0 {
; GFX7-LABEL: flat_agent_atomic_fmax_ret_f32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v4
-; GFX7-NEXT: v_max_f32_e32 v3, v3, v2
-; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX7-NEXT: flat_atomic_fmax v0, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB0_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmax ptr %ptr, float %val syncscope("agent") seq_cst
ret float %result
@@ -230,28 +162,10 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos(ptr %ptr, float %val
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB1_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos:
@@ -282,56 +196,23 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos(ptr %ptr, float %val
; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_max_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT: flat_atomic_max_f32 v0, v[0:1], v2 offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB1_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v0, v[3:4]
-; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v0
-; GFX10-NEXT: v_max_f32_e32 v0, v6, v6
-; GFX10-NEXT: v_max_f32_e32 v5, v0, v1
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX10-NEXT: flat_atomic_fmax v0, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB1_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos:
@@ -410,26 +291,11 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos(ptr %ptr, float %val
; GFX7-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fc, v0
-; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v0, v[3:4]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; GFX7-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; GFX7-NEXT: v_max_f32_e32 v5, v0, v1
-; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_atomic_fmax v0, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB1_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 511
%result = atomicrmw fmax ptr %gep, float %val syncscope("agent") seq_cst
@@ -444,28 +310,10 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg(ptr %ptr, float %val
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB2_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg:
@@ -503,61 +351,25 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg(ptr %ptr, float %val
; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: flat_load_b32 v0, v[4:5]
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX11-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v0, v6, v6
-; GFX11-NEXT: v_max_f32_e32 v5, v0, v1
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc
+; GFX11-NEXT: flat_atomic_max_f32 v0, v[0:1], v2 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB2_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v0, v[3:4]
-; GFX10-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v0
-; GFX10-NEXT: v_max_f32_e32 v0, v6, v6
-; GFX10-NEXT: v_max_f32_e32 v5, v0, v1
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX10-NEXT: flat_atomic_fmax v0, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB2_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg:
@@ -642,26 +454,11 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg(ptr %ptr, float %val
; GFX7-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
-; GFX7-NEXT: flat_load_dword v0, v[3:4]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; GFX7-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; GFX7-NEXT: v_max_f32_e32 v5, v0, v1
-; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX7-NEXT: flat_atomic_fmax v0, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB2_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 -512
%result = atomicrmw fmax ptr %gep, float %val syncscope("agent") seq_cst
@@ -676,27 +473,10 @@ define void @flat_agent_atomic_fmax_noret_f32(ptr %ptr, float %val) #0 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB3_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32:
@@ -726,53 +506,23 @@ define void @flat_agent_atomic_fmax_noret_f32(ptr %ptr, float %val) #0 {
; GFX11-LABEL: flat_agent_atomic_fmax_noret_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_atomic_max_f32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB3_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_noret_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmax v[0:1], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB3_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32:
@@ -847,24 +597,9 @@ define void @flat_agent_atomic_fmax_noret_f32(ptr %ptr, float %val) #0 {
; GFX7-LABEL: flat_agent_atomic_fmax_noret_f32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_atomic_fmax v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB3_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%unused = atomicrmw fmax ptr %ptr, float %val syncscope("agent") seq_cst
ret void
@@ -878,27 +613,10 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos(ptr %ptr, float %va
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:2044
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB4_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos:
@@ -928,28 +646,12 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos(ptr %ptr, float %va
; GFX11-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_atomic_max_f32 v[0:1], v2 offset:2044
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB4_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos:
@@ -957,26 +659,12 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos(ptr %ptr, float %va
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmax v[0:1], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB4_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos:
@@ -1055,24 +743,9 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos(ptr %ptr, float %va
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_atomic_fmax v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB4_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 511
%unused = atomicrmw fmax ptr %gep, float %val syncscope("agent") seq_cst
@@ -1087,27 +760,10 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg(ptr %ptr, float %va
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:-2048
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB5_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_neg:
@@ -1143,32 +799,14 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg(ptr %ptr, float %va
; GFX11-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_neg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: flat_load_b32 v3, v[3:4]
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_atomic_max_f32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB5_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_neg:
@@ -1176,26 +814,12 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg(ptr %ptr, float %va
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmax v[0:1], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB5_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_neg:
@@ -1282,24 +906,9 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg(ptr %ptr, float %va
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_atomic_fmax v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB5_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 -512
%unused = atomicrmw fmax ptr %gep, float %val syncscope("agent") seq_cst
@@ -1745,28 +1354,10 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB8_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32__ftz:
@@ -1797,55 +1388,21 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_max_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: flat_atomic_max_f32 v0, v[0:1], v2 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB8_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_ret_f32__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT: v_max_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_fmax v0, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB8_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f32__ftz:
@@ -1923,25 +1480,9 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX7-LABEL: flat_agent_atomic_fmax_ret_f32__ftz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v4
-; GFX7-NEXT: v_max_f32_e32 v3, v3, v2
-; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX7-NEXT: flat_atomic_fmax v0, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB8_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmax ptr %ptr, float %val syncscope("agent") seq_cst
ret float %result
@@ -1955,28 +1496,10 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr %ptr, float
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB9_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz:
@@ -2007,56 +1530,23 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr %ptr, float
; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_max_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT: flat_atomic_max_f32 v0, v[0:1], v2 offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB9_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v0, v[3:4]
-; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v0
-; GFX10-NEXT: v_max_f32_e32 v0, v6, v6
-; GFX10-NEXT: v_max_f32_e32 v5, v0, v1
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX10-NEXT: flat_atomic_fmax v0, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB9_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz:
@@ -2135,26 +1625,11 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr %ptr, float
; GFX7-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fc, v0
-; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v0, v[3:4]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; GFX7-NEXT: v_max_f32_e32 v5, v0, v1
-; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_atomic_fmax v0, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB9_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 511
%result = atomicrmw fmax ptr %gep, float %val syncscope("agent") seq_cst
@@ -2169,28 +1644,10 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr %ptr, float
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB10_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz:
@@ -2228,61 +1685,25 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr %ptr, float
; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: flat_load_b32 v0, v[4:5]
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v0, v6, v6
-; GFX11-NEXT: v_max_f32_e32 v5, v0, v1
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc
+; GFX11-NEXT: flat_atomic_max_f32 v0, v[0:1], v2 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB10_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v0, v[3:4]
-; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v0
-; GFX10-NEXT: v_max_f32_e32 v0, v6, v6
-; GFX10-NEXT: v_max_f32_e32 v5, v0, v1
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX10-NEXT: flat_atomic_fmax v0, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB10_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz:
@@ -2367,26 +1788,11 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr %ptr, float
; GFX7-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
-; GFX7-NEXT: flat_load_dword v0, v[3:4]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; GFX7-NEXT: v_max_f32_e32 v5, v0, v1
-; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX7-NEXT: flat_atomic_fmax v0, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB10_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 -512
%result = atomicrmw fmax ptr %gep, float %val syncscope("agent") seq_cst
@@ -2401,27 +1807,10 @@ define void @flat_agent_atomic_fmax_noret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB11_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__ftz:
@@ -2451,53 +1840,23 @@ define void @flat_agent_atomic_fmax_noret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX11-LABEL: flat_agent_atomic_fmax_noret_f32__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_atomic_max_f32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB11_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_noret_f32__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmax v[0:1], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB11_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__ftz:
@@ -2572,24 +1931,9 @@ define void @flat_agent_atomic_fmax_noret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX7-LABEL: flat_agent_atomic_fmax_noret_f32__ftz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_atomic_fmax v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB11_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%unused = atomicrmw fmax ptr %ptr, float %val syncscope("agent") seq_cst
ret void
@@ -2603,27 +1947,10 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr %ptr, floa
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:2044
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB12_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz:
@@ -2653,28 +1980,12 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr %ptr, floa
; GFX11-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_atomic_max_f32 v[0:1], v2 offset:2044
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB12_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz:
@@ -2682,26 +1993,12 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr %ptr, floa
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmax v[0:1], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB12_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz:
@@ -2780,24 +2077,9 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr %ptr, floa
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_atomic_fmax v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB12_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 511
%unused = atomicrmw fmax ptr %gep, float %val syncscope("agent") seq_cst
@@ -2812,27 +2094,10 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr %ptr, floa
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:-2048
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB13_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz:
@@ -2868,32 +2133,14 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr %ptr, floa
; GFX11-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: flat_load_b32 v3, v[3:4]
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_atomic_max_f32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB13_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz:
@@ -2901,26 +2148,12 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr %ptr, floa
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmax v[0:1], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB13_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz:
@@ -3007,24 +2240,9 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr %ptr, floa
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_atomic_fmax v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB13_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 -512
%unused = atomicrmw fmax ptr %gep, float %val syncscope("agent") seq_cst
@@ -3497,27 +2715,10 @@ define double @flat_agent_atomic_fmax_ret_f64(ptr %ptr, double %val) #0 {
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX940-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX940-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0
+; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB16_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64:
@@ -3551,54 +2752,19 @@ define double @flat_agent_atomic_fmax_ret_f64(ptr %ptr, double %val) #0 {
; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v7, v5
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
-; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX10-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX10-NEXT: flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB16_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
-; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f64:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX90A-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB16_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64:
@@ -3659,30 +2825,9 @@ define double @flat_agent_atomic_fmax_ret_f64(ptr %ptr, double %val) #0 {
; GFX7-LABEL: flat_agent_atomic_fmax_ret_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0
-; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v4, v[0:1]
-; GFX7-NEXT: flat_load_dword v5, v[5:6]
-; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v7, v5
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
-; GFX7-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX7-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX7-NEXT: flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB16_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v0, v4
-; GFX7-NEXT: v_mov_b32_e32 v1, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst
ret double %result
@@ -3723,27 +2868,10 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos(ptr %ptr, double %v
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:2040
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX940-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX940-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:2040 sc0
+; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] offset:2040 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB17_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos:
@@ -3777,54 +2905,21 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos(ptr %ptr, double %v
; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
-; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v9, v1
-; GFX10-NEXT: v_mov_b32_e32 v8, v0
-; GFX10-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX10-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX10-NEXT: flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB17_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:2040
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX90A-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:2040 glc
+; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] offset:2040 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB17_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos:
@@ -3885,30 +2980,11 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos(ptr %ptr, double %v
; GFX7-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7f8, v0
-; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7f8, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v1, v[0:1]
-; GFX7-NEXT: flat_load_dword v0, v[4:5]
-; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v9, v1
-; GFX7-NEXT: v_mov_b32_e32 v8, v0
-; GFX7-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX7-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX7-NEXT: flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB17_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr %ptr, i64 255
%result = atomicrmw fmax ptr %gep, double %val syncscope("agent") seq_cst
@@ -3950,33 +3026,13 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg(ptr %ptr, double %v
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v4, v0
-; GFX940-NEXT: v_mov_b32_e32 v5, v1
-; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4
-; GFX940-NEXT: s_movk_i32 s0, 0xf800
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc
-; GFX940-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX940-NEXT: s_mov_b32 s1, -1
-; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[8:9], v[0:1]
-; GFX940-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX940-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
+; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] sc0
+; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB18_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg:
@@ -4015,56 +3071,23 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg(ptr %ptr, double %v
; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
-; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v9, v1
-; GFX10-NEXT: v_mov_b32_e32 v8, v0
-; GFX10-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX10-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX10-NEXT: flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB18_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX90A-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB18_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg:
@@ -4129,30 +3152,11 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg(ptr %ptr, double %v
; GFX7-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff804, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: flat_load_dword v1, v[0:1]
-; GFX7-NEXT: flat_load_dword v0, v[4:5]
-; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v9, v1
-; GFX7-NEXT: v_mov_b32_e32 v8, v0
-; GFX7-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX7-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX7-NEXT: flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB18_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr %ptr, i64 -256
%result = atomicrmw fmax ptr %gep, double %val syncscope("agent") seq_cst
@@ -4193,25 +3197,10 @@ define void @flat_agent_atomic_fmax_noret_f64(ptr %ptr, double %val) #0 {
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0
+; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[2:3]
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB19_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64:
@@ -4244,50 +3233,20 @@ define void @flat_agent_atomic_fmax_noret_f64(ptr %ptr, double %val) #0 {
; GFX10-LABEL: flat_agent_atomic_fmax_noret_f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmax_x2 v[0:1], v[2:3]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v5, v3
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB19_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f64:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
+; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3]
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB19_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmax_noret_f64:
@@ -4344,28 +3303,9 @@ define void @flat_agent_atomic_fmax_noret_f64(ptr %ptr, double %val) #0 {
; GFX7-LABEL: flat_agent_atomic_fmax_noret_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0
-; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v4, v[0:1]
-; GFX7-NEXT: flat_load_dword v5, v[5:6]
-; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
+; GFX7-NEXT: flat_atomic_fmax_x2 v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB19_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%unused = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst
ret void
@@ -4405,25 +3345,10 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos(ptr %ptr, double %v
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:2040
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] offset:2040 sc0
+; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] offset:2040
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB20_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos:
@@ -4458,50 +3383,20 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos(ptr %ptr, double %v
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmax_x2 v[0:1], v[2:3]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v5, v3
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB20_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:2040
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] offset:2040 glc
+; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] offset:2040
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB20_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos:
@@ -4560,30 +3455,11 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos(ptr %ptr, double %v
; GFX7-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v6, vcc, 0x7f8, v0
-; GFX7-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7f8, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v5, v[0:1]
-; GFX7-NEXT: flat_load_dword v4, v[6:7]
-; GFX7-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[0:1]
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[6:7], v[2:5] glc
+; GFX7-NEXT: flat_atomic_fmax_x2 v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB20_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr %ptr, i64 255
%unused = atomicrmw fmax ptr %gep, double %val syncscope("agent") seq_cst
@@ -4624,31 +3500,13 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg(ptr %ptr, double %v
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX940-NEXT: s_movk_i32 s0, 0xf800
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX940-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
-; GFX940-NEXT: s_mov_b32 s1, -1
-; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0
+; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[2:3]
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB21_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg:
@@ -4687,54 +3545,22 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg(ptr %ptr, double %v
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmax_x2 v[0:1], v[2:3]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v5, v3
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB21_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, -1, v1, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[0:1]
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[6:7], v[2:5] glc
+; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3]
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB21_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg:
@@ -4798,30 +3624,11 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg(ptr %ptr, double %v
; GFX7-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v6, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff804, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: flat_load_dword v5, v[0:1]
-; GFX7-NEXT: flat_load_dword v4, v[6:7]
-; GFX7-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[0:1]
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[6:7], v[2:5] glc
+; GFX7-NEXT: flat_atomic_fmax_x2 v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB21_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr %ptr, i64 -256
%unused = atomicrmw fmax ptr %gep, double %val syncscope("agent") seq_cst
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
index fdfbb42..216ea07 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
@@ -20,28 +20,10 @@ define float @flat_agent_atomic_fmin_ret_f32(ptr %ptr, float %val) #0 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB0_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32:
@@ -72,55 +54,21 @@ define float @flat_agent_atomic_fmin_ret_f32(ptr %ptr, float %val) #0 {
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_min_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: flat_atomic_min_f32 v0, v[0:1], v2 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB0_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_ret_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT: v_min_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB0_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32:
@@ -198,25 +146,9 @@ define float @flat_agent_atomic_fmin_ret_f32(ptr %ptr, float %val) #0 {
; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v4
-; GFX7-NEXT: v_min_f32_e32 v3, v3, v2
-; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX7-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB0_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmin ptr %ptr, float %val syncscope("agent") seq_cst
ret float %result
@@ -230,28 +162,10 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos(ptr %ptr, float %val
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB1_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos:
@@ -282,56 +196,23 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos(ptr %ptr, float %val
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_min_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT: flat_atomic_min_f32 v0, v[0:1], v2 offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB1_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v0, v[3:4]
-; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v0
-; GFX10-NEXT: v_max_f32_e32 v0, v6, v6
-; GFX10-NEXT: v_min_f32_e32 v5, v0, v1
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX10-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB1_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos:
@@ -410,26 +291,11 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos(ptr %ptr, float %val
; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fc, v0
-; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v0, v[3:4]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; GFX7-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; GFX7-NEXT: v_min_f32_e32 v5, v0, v1
-; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB1_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 511
%result = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst
@@ -444,28 +310,10 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg(ptr %ptr, float %val
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB2_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg:
@@ -503,61 +351,25 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg(ptr %ptr, float %val
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: flat_load_b32 v0, v[4:5]
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX11-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v0, v6, v6
-; GFX11-NEXT: v_min_f32_e32 v5, v0, v1
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc
+; GFX11-NEXT: flat_atomic_min_f32 v0, v[0:1], v2 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB2_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v0, v[3:4]
-; GFX10-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v0
-; GFX10-NEXT: v_max_f32_e32 v0, v6, v6
-; GFX10-NEXT: v_min_f32_e32 v5, v0, v1
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX10-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB2_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg:
@@ -642,26 +454,11 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg(ptr %ptr, float %val
; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
-; GFX7-NEXT: flat_load_dword v0, v[3:4]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; GFX7-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; GFX7-NEXT: v_min_f32_e32 v5, v0, v1
-; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX7-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB2_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 -512
%result = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst
@@ -676,27 +473,10 @@ define void @flat_agent_atomic_fmin_noret_f32(ptr %ptr, float %val) #0 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB3_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32:
@@ -726,53 +506,23 @@ define void @flat_agent_atomic_fmin_noret_f32(ptr %ptr, float %val) #0 {
; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_atomic_min_f32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB3_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_noret_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmin v[0:1], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB3_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32:
@@ -847,24 +597,9 @@ define void @flat_agent_atomic_fmin_noret_f32(ptr %ptr, float %val) #0 {
; GFX7-LABEL: flat_agent_atomic_fmin_noret_f32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_atomic_fmin v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB3_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%unused = atomicrmw fmin ptr %ptr, float %val syncscope("agent") seq_cst
ret void
@@ -878,27 +613,10 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos(ptr %ptr, float %va
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:2044
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB4_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos:
@@ -928,28 +646,12 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos(ptr %ptr, float %va
; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_atomic_min_f32 v[0:1], v2 offset:2044
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB4_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos:
@@ -957,26 +659,12 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos(ptr %ptr, float %va
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmin v[0:1], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB4_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos:
@@ -1055,24 +743,9 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos(ptr %ptr, float %va
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_atomic_fmin v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB4_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 511
%unused = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst
@@ -1087,27 +760,10 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg(ptr %ptr, float %va
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:-2048
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB5_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg:
@@ -1143,32 +799,14 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg(ptr %ptr, float %va
; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: flat_load_b32 v3, v[3:4]
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_atomic_min_f32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB5_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg:
@@ -1176,26 +814,12 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg(ptr %ptr, float %va
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmin v[0:1], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB5_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg:
@@ -1282,24 +906,9 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg(ptr %ptr, float %va
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_atomic_fmin v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB5_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 -512
%unused = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst
@@ -1745,28 +1354,10 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB8_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__ftz:
@@ -1797,55 +1388,21 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_min_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: flat_atomic_min_f32 v0, v[0:1], v2 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB8_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_ret_f32__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT: v_min_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB8_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__ftz:
@@ -1923,25 +1480,9 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__ftz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v4
-; GFX7-NEXT: v_min_f32_e32 v3, v3, v2
-; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX7-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB8_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmin ptr %ptr, float %val syncscope("agent") seq_cst
ret float %result
@@ -1955,28 +1496,10 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr %ptr, float
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB9_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
@@ -2007,56 +1530,23 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr %ptr, float
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_min_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT: flat_atomic_min_f32 v0, v[0:1], v2 offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB9_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v0, v[3:4]
-; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v0
-; GFX10-NEXT: v_max_f32_e32 v0, v6, v6
-; GFX10-NEXT: v_min_f32_e32 v5, v0, v1
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX10-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB9_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
@@ -2135,26 +1625,11 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr %ptr, float
; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fc, v0
-; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v0, v[3:4]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; GFX7-NEXT: v_min_f32_e32 v5, v0, v1
-; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB9_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 511
%result = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst
@@ -2169,28 +1644,10 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr %ptr, float
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB10_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
@@ -2228,61 +1685,25 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr %ptr, float
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: flat_load_b32 v0, v[4:5]
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v0, v6, v6
-; GFX11-NEXT: v_min_f32_e32 v5, v0, v1
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc
+; GFX11-NEXT: flat_atomic_min_f32 v0, v[0:1], v2 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB10_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v0, v[3:4]
-; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v0
-; GFX10-NEXT: v_max_f32_e32 v0, v6, v6
-; GFX10-NEXT: v_min_f32_e32 v5, v0, v1
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX10-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB10_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
@@ -2367,26 +1788,11 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr %ptr, float
; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
-; GFX7-NEXT: flat_load_dword v0, v[3:4]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; GFX7-NEXT: v_min_f32_e32 v5, v0, v1
-; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX7-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB10_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 -512
%result = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst
@@ -2401,27 +1807,10 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB11_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__ftz:
@@ -2451,53 +1840,23 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_atomic_min_f32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB11_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_noret_f32__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmin v[0:1], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB11_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__ftz:
@@ -2572,24 +1931,9 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX7-LABEL: flat_agent_atomic_fmin_noret_f32__ftz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_atomic_fmin v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB11_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%unused = atomicrmw fmin ptr %ptr, float %val syncscope("agent") seq_cst
ret void
@@ -2603,27 +1947,10 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr %ptr, floa
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:2044
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB12_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
@@ -2653,28 +1980,12 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr %ptr, floa
; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_atomic_min_f32 v[0:1], v2 offset:2044
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB12_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
@@ -2682,26 +1993,12 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr %ptr, floa
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmin v[0:1], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB12_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
@@ -2780,24 +2077,9 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr %ptr, floa
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_atomic_fmin v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB12_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 511
%unused = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst
@@ -2812,27 +2094,10 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr %ptr, floa
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:-2048
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB13_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
@@ -2868,32 +2133,14 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr %ptr, floa
; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: flat_load_b32 v3, v[3:4]
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_atomic_min_f32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB13_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
@@ -2901,26 +2148,12 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr %ptr, floa
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmin v[0:1], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB13_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
@@ -3007,24 +2240,9 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr %ptr, floa
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_atomic_fmin v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB13_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 -512
%unused = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst
@@ -3497,27 +2715,10 @@ define double @flat_agent_atomic_fmin_ret_f64(ptr %ptr, double %val) #0 {
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX940-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX940-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0
+; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB16_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64:
@@ -3551,54 +2752,19 @@ define double @flat_agent_atomic_fmin_ret_f64(ptr %ptr, double %val) #0 {
; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v7, v5
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
-; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX10-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB16_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
-; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB16_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64:
@@ -3659,30 +2825,9 @@ define double @flat_agent_atomic_fmin_ret_f64(ptr %ptr, double %val) #0 {
; GFX7-LABEL: flat_agent_atomic_fmin_ret_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0
-; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v4, v[0:1]
-; GFX7-NEXT: flat_load_dword v5, v[5:6]
-; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v7, v5
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
-; GFX7-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX7-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB16_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v0, v4
-; GFX7-NEXT: v_mov_b32_e32 v1, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst
ret double %result
@@ -3723,27 +2868,10 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos(ptr %ptr, double %v
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:2040
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX940-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX940-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:2040 sc0
+; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] offset:2040 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB17_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos:
@@ -3777,54 +2905,21 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos(ptr %ptr, double %v
; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
-; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v9, v1
-; GFX10-NEXT: v_mov_b32_e32 v8, v0
-; GFX10-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX10-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB17_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:2040
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:2040 glc
+; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] offset:2040 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB17_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos:
@@ -3885,30 +2980,11 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos(ptr %ptr, double %v
; GFX7-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7f8, v0
-; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7f8, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v1, v[0:1]
-; GFX7-NEXT: flat_load_dword v0, v[4:5]
-; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v9, v1
-; GFX7-NEXT: v_mov_b32_e32 v8, v0
-; GFX7-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX7-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB17_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr %ptr, i64 255
%result = atomicrmw fmin ptr %gep, double %val syncscope("agent") seq_cst
@@ -3950,33 +3026,13 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg(ptr %ptr, double %v
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v4, v0
-; GFX940-NEXT: v_mov_b32_e32 v5, v1
-; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4
-; GFX940-NEXT: s_movk_i32 s0, 0xf800
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc
-; GFX940-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX940-NEXT: s_mov_b32 s1, -1
-; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[8:9], v[0:1]
-; GFX940-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX940-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
+; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] sc0
+; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB18_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg:
@@ -4015,56 +3071,23 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg(ptr %ptr, double %v
; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
-; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v9, v1
-; GFX10-NEXT: v_mov_b32_e32 v8, v0
-; GFX10-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX10-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB18_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX90A-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB18_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg:
@@ -4129,30 +3152,11 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg(ptr %ptr, double %v
; GFX7-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff804, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: flat_load_dword v1, v[0:1]
-; GFX7-NEXT: flat_load_dword v0, v[4:5]
-; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v9, v1
-; GFX7-NEXT: v_mov_b32_e32 v8, v0
-; GFX7-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX7-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB18_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr %ptr, i64 -256
%result = atomicrmw fmin ptr %gep, double %val syncscope("agent") seq_cst
@@ -4193,25 +3197,10 @@ define void @flat_agent_atomic_fmin_noret_f64(ptr %ptr, double %val) #0 {
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX940-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0
+; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3]
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB19_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64:
@@ -4244,50 +3233,20 @@ define void @flat_agent_atomic_fmin_noret_f64(ptr %ptr, double %val) #0 {
; GFX10-LABEL: flat_agent_atomic_fmin_noret_f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX10-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[2:3]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v5, v3
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB19_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f64:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX90A-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
+; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3]
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB19_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64:
@@ -4344,28 +3303,9 @@ define void @flat_agent_atomic_fmin_noret_f64(ptr %ptr, double %val) #0 {
; GFX7-LABEL: flat_agent_atomic_fmin_noret_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0
-; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v4, v[0:1]
-; GFX7-NEXT: flat_load_dword v5, v[5:6]
-; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX7-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
+; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB19_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%unused = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst
ret void
@@ -4405,25 +3345,10 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos(ptr %ptr, double %v
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:2040
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX940-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] offset:2040 sc0
+; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] offset:2040
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB20_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos:
@@ -4458,50 +3383,20 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos(ptr %ptr, double %v
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX10-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[2:3]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v5, v3
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB20_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:2040
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX90A-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] offset:2040 glc
+; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] offset:2040
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB20_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos:
@@ -4560,30 +3455,11 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos(ptr %ptr, double %v
; GFX7-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v6, vcc, 0x7f8, v0
-; GFX7-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7f8, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v5, v[0:1]
-; GFX7-NEXT: flat_load_dword v4, v[6:7]
-; GFX7-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX7-NEXT: v_min_f64 v[2:3], v[2:3], v[0:1]
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[6:7], v[2:5] glc
+; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB20_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr %ptr, i64 255
%unused = atomicrmw fmin ptr %gep, double %val syncscope("agent") seq_cst
@@ -4624,31 +3500,13 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg(ptr %ptr, double %v
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX940-NEXT: s_movk_i32 s0, 0xf800
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX940-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
-; GFX940-NEXT: s_mov_b32 s1, -1
-; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX940-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0
+; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3]
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB21_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg:
@@ -4687,54 +3545,22 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg(ptr %ptr, double %v
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX10-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[2:3]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v5, v3
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB21_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, -1, v1, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX90A-NEXT: v_min_f64 v[2:3], v[2:3], v[0:1]
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[6:7], v[2:5] glc
+; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3]
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB21_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg:
@@ -4798,30 +3624,11 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg(ptr %ptr, double %v
; GFX7-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v6, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff804, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: flat_load_dword v5, v[0:1]
-; GFX7-NEXT: flat_load_dword v4, v[6:7]
-; GFX7-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX7-NEXT: v_min_f64 v[2:3], v[2:3], v[0:1]
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[6:7], v[2:5] glc
+; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB21_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr %ptr, i64 -256
%unused = atomicrmw fmin ptr %gep, double %val syncscope("agent") seq_cst
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
index 492c74e..ae5dca4 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
@@ -21,28 +21,10 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB0_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f32:
@@ -73,55 +55,21 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val
; GFX11-LABEL: global_agent_atomic_fmax_ret_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_max_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB0_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_ret_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT: v_max_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB0_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32:
@@ -203,27 +151,10 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GFX7-NEXT: v_max_f32_e32 v4, v3, v2
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB0_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_ret_f32:
@@ -233,28 +164,10 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GFX6-NEXT: v_max_f32_e32 v4, v3, v2
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
-; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB0_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v0, v3
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst
@@ -269,28 +182,10 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) %
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB1_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos:
@@ -321,55 +216,21 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) %
; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_max_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB1_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT: v_max_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB1_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos:
@@ -452,27 +313,10 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) %
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GFX7-NEXT: v_max_f32_e32 v4, v3, v2
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB1_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos:
@@ -482,28 +326,10 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) %
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GFX6-NEXT: v_max_f32_e32 v4, v3, v2
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
-; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB1_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v0, v3
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
@@ -519,28 +345,10 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) %
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB2_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg:
@@ -571,55 +379,21 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) %
; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_max_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off offset:-2048 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB2_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT: v_max_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off offset:-2048 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB2_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg:
@@ -699,71 +473,26 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) %
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_movk_i32 s4, 0xf800
-; GFX7-NEXT: v_mov_b32_e32 v4, v1
-; GFX7-NEXT: v_mov_b32_e32 v3, v0
-; GFX7-NEXT: s_mov_b32 s5, -1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v3
-; GFX7-NEXT: v_addc_u32_e32 v4, vcc, -1, v4, vcc
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: s_mov_b32 s4, s6
-; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; GFX7-NEXT: v_max_f32_e32 v5, v0, v2
-; GFX7-NEXT: v_mov_b32_e32 v0, v5
-; GFX7-NEXT: v_mov_b32_e32 v1, v6
-; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v[3:4], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_mov_b32 s5, -1
+; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB2_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_movk_i32 s4, 0xf800
-; GFX6-NEXT: v_mov_b32_e32 v4, v1
-; GFX6-NEXT: v_mov_b32_e32 v3, v0
-; GFX6-NEXT: s_mov_b32 s5, -1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64
-; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v3
-; GFX6-NEXT: v_addc_u32_e32 v4, vcc, -1, v4, vcc
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: s_mov_b32 s4, s6
-; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v0
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; GFX6-NEXT: v_max_f32_e32 v5, v0, v2
-; GFX6-NEXT: v_mov_b32_e32 v0, v5
-; GFX6-NEXT: v_mov_b32_e32 v1, v6
-; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v[3:4], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_mov_b32 s5, -1
+; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB2_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
@@ -779,27 +508,10 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB3_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f32:
@@ -829,53 +541,21 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va
; GFX11-LABEL: global_agent_atomic_fmax_noret_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB3_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_noret_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB3_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32:
@@ -954,26 +634,9 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_mov_b32_e32 v6, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v5
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB3_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_noret_f32:
@@ -983,27 +646,9 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX6-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v5
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB3_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%unused = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst
@@ -1018,27 +663,10 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:2044
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB4_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos:
@@ -1068,53 +696,21 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off offset:2044
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB4_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off offset:2044
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB4_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos:
@@ -1195,26 +791,9 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_mov_b32_e32 v6, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v5
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB4_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos:
@@ -1224,27 +803,9 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX6-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX6-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v5
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB4_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
@@ -1260,27 +821,10 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:-2048
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB5_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg:
@@ -1310,53 +854,21 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1)
; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off offset:-2048
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB5_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off offset:-2048
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB5_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg:
@@ -1434,67 +946,24 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1)
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_movk_i32 s4, 0xf800
-; GFX7-NEXT: s_mov_b32 s5, -1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: s_mov_b32 s4, s6
-; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_mov_b32_e32 v6, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_mov_b32 s5, -1
+; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v5
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB5_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_movk_i32 s4, 0xf800
-; GFX6-NEXT: s_mov_b32 s5, -1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX6-NEXT: s_mov_b32 s4, s6
-; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX6-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_mov_b32 s5, -1
+; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v5
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB5_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
@@ -2010,28 +1479,10 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB8_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__ftz:
@@ -2062,55 +1513,21 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float
; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_max_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB8_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT: v_max_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB8_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__ftz:
@@ -2192,27 +1609,10 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GFX7-NEXT: v_max_f32_e32 v4, v3, v2
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB8_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__ftz:
@@ -2222,28 +1622,10 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GFX6-NEXT: v_max_f32_e32 v4, v3, v2
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
-; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB8_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v0, v3
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst
@@ -2258,28 +1640,10 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB9_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz:
@@ -2310,55 +1674,21 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace
; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_max_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB9_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT: v_max_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB9_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz:
@@ -2441,27 +1771,10 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GFX7-NEXT: v_max_f32_e32 v4, v3, v2
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB9_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz:
@@ -2471,28 +1784,10 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GFX6-NEXT: v_max_f32_e32 v4, v3, v2
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
-; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB9_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v0, v3
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
@@ -2508,28 +1803,10 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB10_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz:
@@ -2560,55 +1837,21 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace
; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_max_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off offset:-2048 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB10_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT: v_max_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off offset:-2048 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB10_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz:
@@ -2688,71 +1931,26 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_movk_i32 s4, 0xf800
-; GFX7-NEXT: v_mov_b32_e32 v4, v1
-; GFX7-NEXT: v_mov_b32_e32 v3, v0
-; GFX7-NEXT: s_mov_b32 s5, -1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v3
-; GFX7-NEXT: v_addc_u32_e32 v4, vcc, -1, v4, vcc
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: s_mov_b32 s4, s6
-; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; GFX7-NEXT: v_max_f32_e32 v5, v0, v2
-; GFX7-NEXT: v_mov_b32_e32 v0, v5
-; GFX7-NEXT: v_mov_b32_e32 v1, v6
-; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v[3:4], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_mov_b32 s5, -1
+; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB10_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_movk_i32 s4, 0xf800
-; GFX6-NEXT: v_mov_b32_e32 v4, v1
-; GFX6-NEXT: v_mov_b32_e32 v3, v0
-; GFX6-NEXT: s_mov_b32 s5, -1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64
-; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v3
-; GFX6-NEXT: v_addc_u32_e32 v4, vcc, -1, v4, vcc
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: s_mov_b32 s4, s6
-; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v0
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; GFX6-NEXT: v_max_f32_e32 v5, v0, v2
-; GFX6-NEXT: v_mov_b32_e32 v0, v5
-; GFX6-NEXT: v_mov_b32_e32 v1, v6
-; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v[3:4], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_mov_b32 s5, -1
+; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB10_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
@@ -2768,27 +1966,10 @@ define void @global_agent_atomic_fmax_noret_f32__ftz(ptr addrspace(1) %ptr, floa
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB11_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__ftz:
@@ -2818,53 +1999,21 @@ define void @global_agent_atomic_fmax_noret_f32__ftz(ptr addrspace(1) %ptr, floa
; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB11_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB11_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__ftz:
@@ -2943,26 +2092,9 @@ define void @global_agent_atomic_fmax_noret_f32__ftz(ptr addrspace(1) %ptr, floa
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_mov_b32_e32 v6, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v5
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB11_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__ftz:
@@ -2972,27 +2104,9 @@ define void @global_agent_atomic_fmax_noret_f32__ftz(ptr addrspace(1) %ptr, floa
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX6-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v5
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB11_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%unused = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst
@@ -3007,27 +2121,10 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspac
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:2044
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB12_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz:
@@ -3057,53 +2154,21 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspac
; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off offset:2044
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB12_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off offset:2044
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB12_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz:
@@ -3184,26 +2249,9 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspac
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_mov_b32_e32 v6, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v5
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB12_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz:
@@ -3213,27 +2261,9 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspac
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX6-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v5
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB12_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
@@ -3249,27 +2279,10 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:-2048
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB13_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz:
@@ -3299,53 +2312,21 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac
; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off offset:-2048
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB13_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off offset:-2048
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB13_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz:
@@ -3423,67 +2404,24 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_movk_i32 s4, 0xf800
-; GFX7-NEXT: s_mov_b32 s5, -1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: s_mov_b32 s4, s6
-; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_mov_b32_e32 v6, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_mov_b32 s5, -1
+; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v5
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB13_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_movk_i32 s4, 0xf800
-; GFX6-NEXT: s_mov_b32 s5, -1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX6-NEXT: s_mov_b32 s4, s6
-; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX6-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_mov_b32 s5, -1
+; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v5
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB13_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
@@ -4026,27 +2964,10 @@ define double @global_agent_atomic_fmax_ret_f64(ptr addrspace(1) %ptr, double %v
; GFX940-LABEL: global_agent_atomic_fmax_ret_f64:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX940-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX940-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off sc0
+; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB16_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fmax_ret_f64:
@@ -4080,54 +3001,19 @@ define double @global_agent_atomic_fmax_ret_f64(ptr addrspace(1) %ptr, double %v
; GFX10-LABEL: global_agent_atomic_fmax_ret_f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v7, v5
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
-; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX10-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB16_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
-; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX90A-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB16_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmax_ret_f64:
@@ -4186,69 +3072,28 @@ define double @global_agent_atomic_fmax_ret_f64(ptr addrspace(1) %ptr, double %v
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_mov_b32_e32 v5, v1
-; GFX7-NEXT: v_mov_b32_e32 v4, v0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
-; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v11, v1
-; GFX7-NEXT: v_mov_b32_e32 v10, v0
-; GFX7-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX7-NEXT: v_max_f64 v[8:9], v[0:1], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v0, v8
-; GFX7-NEXT: v_mov_b32_e32 v1, v9
-; GFX7-NEXT: v_mov_b32_e32 v2, v10
-; GFX7-NEXT: v_mov_b32_e32 v3, v11
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
+; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB16_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
+; GFX7-NEXT: v_mov_b32_e32 v1, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_ret_f64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: v_mov_b32_e32 v5, v1
-; GFX6-NEXT: v_mov_b32_e32 v4, v0
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
-; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v1
-; GFX6-NEXT: v_mov_b32_e32 v10, v0
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX6-NEXT: v_max_f64 v[8:9], v[0:1], v[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v0, v8
-; GFX6-NEXT: v_mov_b32_e32 v1, v9
-; GFX6-NEXT: v_mov_b32_e32 v2, v10
-; GFX6-NEXT: v_mov_b32_e32 v3, v11
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
+; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB16_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
+; GFX6-NEXT: v_mov_b32_e32 v1, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst
@@ -4290,27 +3135,10 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos(ptr addrspace(1)
; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX940-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX940-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 sc0
+; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB17_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos:
@@ -4344,54 +3172,19 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos(ptr addrspace(1)
; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040
-; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v7, v5
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
-; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX10-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc
+; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off offset:2040 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB17_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
-; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX90A-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc
+; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:2040 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB17_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos:
@@ -4450,69 +3243,28 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos(ptr addrspace(1)
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_mov_b32_e32 v5, v1
-; GFX7-NEXT: v_mov_b32_e32 v4, v0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:2040
-; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v11, v1
-; GFX7-NEXT: v_mov_b32_e32 v10, v0
-; GFX7-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX7-NEXT: v_max_f64 v[8:9], v[0:1], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v0, v8
-; GFX7-NEXT: v_mov_b32_e32 v1, v9
-; GFX7-NEXT: v_mov_b32_e32 v2, v10
-; GFX7-NEXT: v_mov_b32_e32 v3, v11
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 offset:2040 glc
+; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB17_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
+; GFX7-NEXT: v_mov_b32_e32 v1, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: v_mov_b32_e32 v5, v1
-; GFX6-NEXT: v_mov_b32_e32 v4, v0
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:2040
-; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v1
-; GFX6-NEXT: v_mov_b32_e32 v10, v0
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX6-NEXT: v_max_f64 v[8:9], v[0:1], v[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v0, v8
-; GFX6-NEXT: v_mov_b32_e32 v1, v9
-; GFX6-NEXT: v_mov_b32_e32 v2, v10
-; GFX6-NEXT: v_mov_b32_e32 v3, v11
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 offset:2040 glc
+; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB17_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
+; GFX6-NEXT: v_mov_b32_e32 v1, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr addrspace(1) %ptr, i64 255
@@ -4555,27 +3307,10 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg(ptr addrspace(1)
; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX940-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX940-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 sc0
+; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB18_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg:
@@ -4609,54 +3344,19 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg(ptr addrspace(1)
; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
-; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v7, v5
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
-; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX10-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
+; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off offset:-2048 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB18_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
-; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX90A-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
+; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB18_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg:
@@ -4715,77 +3415,28 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg(ptr addrspace(1)
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_movk_i32 s4, 0xf800
-; GFX7-NEXT: v_mov_b32_e32 v5, v1
-; GFX7-NEXT: v_mov_b32_e32 v4, v0
-; GFX7-NEXT: s_mov_b32 s5, -1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v4
-; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v5, vcc
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: s_mov_b32 s4, s6
-; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v11, v1
-; GFX7-NEXT: v_mov_b32_e32 v10, v0
-; GFX7-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX7-NEXT: v_max_f64 v[8:9], v[0:1], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v0, v8
-; GFX7-NEXT: v_mov_b32_e32 v1, v9
-; GFX7-NEXT: v_mov_b32_e32 v2, v10
-; GFX7-NEXT: v_mov_b32_e32 v3, v11
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_mov_b32 s5, -1
+; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB18_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
+; GFX7-NEXT: v_mov_b32_e32 v1, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_movk_i32 s4, 0xf800
-; GFX6-NEXT: v_mov_b32_e32 v5, v1
-; GFX6-NEXT: v_mov_b32_e32 v4, v0
-; GFX6-NEXT: s_mov_b32 s5, -1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
-; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v4
-; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX6-NEXT: v_addc_u32_e32 v5, vcc, -1, v5, vcc
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: s_mov_b32 s4, s6
-; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v1
-; GFX6-NEXT: v_mov_b32_e32 v10, v0
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX6-NEXT: v_max_f64 v[8:9], v[0:1], v[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v0, v8
-; GFX6-NEXT: v_mov_b32_e32 v1, v9
-; GFX6-NEXT: v_mov_b32_e32 v2, v10
-; GFX6-NEXT: v_mov_b32_e32 v3, v11
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_mov_b32 s5, -1
+; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB18_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
+; GFX6-NEXT: v_mov_b32_e32 v1, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256
@@ -4827,25 +3478,10 @@ define void @global_agent_atomic_fmax_noret_f64(ptr addrspace(1) %ptr, double %v
; GFX940-LABEL: global_agent_atomic_fmax_noret_f64:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off sc0
+; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB19_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fmax_noret_f64:
@@ -4878,50 +3514,19 @@ define void @global_agent_atomic_fmax_noret_f64(ptr addrspace(1) %ptr, double %v
; GFX10-LABEL: global_agent_atomic_fmax_noret_f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[2:3], off
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v5, v3
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB19_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
+; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB19_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmax_noret_f64:
@@ -4979,29 +3584,9 @@ define void @global_agent_atomic_fmax_noret_f64(ptr addrspace(1) %ptr, double %v
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v11, v5
-; GFX7-NEXT: v_mov_b32_e32 v10, v4
-; GFX7-NEXT: v_mov_b32_e32 v9, v3
-; GFX7-NEXT: v_mov_b32_e32 v8, v2
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v4, v8
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v5, v9
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB19_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_noret_f64:
@@ -5011,30 +3596,9 @@ define void @global_agent_atomic_fmax_noret_f64(ptr addrspace(1) %ptr, double %v
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX6-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v5
-; GFX6-NEXT: v_mov_b32_e32 v10, v4
-; GFX6-NEXT: v_mov_b32_e32 v9, v3
-; GFX6-NEXT: v_mov_b32_e32 v8, v2
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v4, v8
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v5, v9
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB19_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%unused = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst
@@ -5075,25 +3639,10 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos(ptr addrspace(1)
; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 sc0
+; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off offset:2040
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB20_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos:
@@ -5126,50 +3675,19 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos(ptr addrspace(1)
; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040
-; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[2:3], off offset:2040
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v5, v3
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB20_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc
+; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off offset:2040
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB20_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos:
@@ -5229,29 +3747,9 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos(ptr addrspace(1)
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:2040
-; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v11, v5
-; GFX7-NEXT: v_mov_b32_e32 v10, v4
-; GFX7-NEXT: v_mov_b32_e32 v9, v3
-; GFX7-NEXT: v_mov_b32_e32 v8, v2
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:2040 glc
+; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v4, v8
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v5, v9
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB20_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos:
@@ -5261,30 +3759,9 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos(ptr addrspace(1)
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:2040
-; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX6-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v5
-; GFX6-NEXT: v_mov_b32_e32 v10, v4
-; GFX6-NEXT: v_mov_b32_e32 v9, v3
-; GFX6-NEXT: v_mov_b32_e32 v8, v2
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:2040 glc
+; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v4, v8
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v5, v9
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB20_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr addrspace(1) %ptr, i64 255
@@ -5326,25 +3803,10 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg(ptr addrspace(1)
; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 sc0
+; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off offset:-2048
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB21_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg:
@@ -5377,50 +3839,19 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg(ptr addrspace(1)
; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
-; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[2:3], off offset:-2048
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v5, v3
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB21_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc
+; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off offset:-2048
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB21_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg:
@@ -5477,73 +3908,24 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg(ptr addrspace(1)
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_movk_i32 s4, 0xf800
-; GFX7-NEXT: s_mov_b32 s5, -1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: s_mov_b32 s4, s6
-; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v11, v5
-; GFX7-NEXT: v_mov_b32_e32 v10, v4
-; GFX7-NEXT: v_mov_b32_e32 v9, v3
-; GFX7-NEXT: v_mov_b32_e32 v8, v2
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_mov_b32 s5, -1
+; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v4, v8
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v5, v9
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB21_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_movk_i32 s4, 0xf800
-; GFX6-NEXT: s_mov_b32 s5, -1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
-; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: s_mov_b32 s4, s6
-; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX6-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v5
-; GFX6-NEXT: v_mov_b32_e32 v10, v4
-; GFX6-NEXT: v_mov_b32_e32 v9, v3
-; GFX6-NEXT: v_mov_b32_e32 v8, v2
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_mov_b32 s5, -1
+; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v4, v8
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v5, v9
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB21_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
index a4adb08..915ce74 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
@@ -21,28 +21,10 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB0_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f32:
@@ -73,55 +55,21 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val
; GFX11-LABEL: global_agent_atomic_fmin_ret_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_min_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB0_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_ret_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT: v_min_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB0_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32:
@@ -203,27 +151,10 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GFX7-NEXT: v_min_f32_e32 v4, v3, v2
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB0_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_ret_f32:
@@ -233,28 +164,10 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GFX6-NEXT: v_min_f32_e32 v4, v3, v2
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
-; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB0_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v0, v3
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst
@@ -269,28 +182,10 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) %
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB1_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos:
@@ -321,55 +216,21 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) %
; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_min_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB1_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT: v_min_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB1_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos:
@@ -452,27 +313,10 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) %
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GFX7-NEXT: v_min_f32_e32 v4, v3, v2
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB1_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos:
@@ -482,28 +326,10 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) %
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GFX6-NEXT: v_min_f32_e32 v4, v3, v2
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
-; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB1_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v0, v3
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
@@ -519,28 +345,10 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) %
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB2_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg:
@@ -571,55 +379,21 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) %
; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_min_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off offset:-2048 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB2_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT: v_min_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off offset:-2048 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB2_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg:
@@ -699,71 +473,26 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) %
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_movk_i32 s4, 0xf800
-; GFX7-NEXT: v_mov_b32_e32 v4, v1
-; GFX7-NEXT: v_mov_b32_e32 v3, v0
-; GFX7-NEXT: s_mov_b32 s5, -1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v3
-; GFX7-NEXT: v_addc_u32_e32 v4, vcc, -1, v4, vcc
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: s_mov_b32 s4, s6
-; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; GFX7-NEXT: v_min_f32_e32 v5, v0, v2
-; GFX7-NEXT: v_mov_b32_e32 v0, v5
-; GFX7-NEXT: v_mov_b32_e32 v1, v6
-; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v[3:4], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_mov_b32 s5, -1
+; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB2_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_movk_i32 s4, 0xf800
-; GFX6-NEXT: v_mov_b32_e32 v4, v1
-; GFX6-NEXT: v_mov_b32_e32 v3, v0
-; GFX6-NEXT: s_mov_b32 s5, -1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64
-; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v3
-; GFX6-NEXT: v_addc_u32_e32 v4, vcc, -1, v4, vcc
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: s_mov_b32 s4, s6
-; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v0
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; GFX6-NEXT: v_min_f32_e32 v5, v0, v2
-; GFX6-NEXT: v_mov_b32_e32 v0, v5
-; GFX6-NEXT: v_mov_b32_e32 v1, v6
-; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v[3:4], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_mov_b32 s5, -1
+; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB2_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
@@ -779,27 +508,10 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB3_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_f32:
@@ -829,53 +541,21 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va
; GFX11-LABEL: global_agent_atomic_fmin_noret_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB3_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_noret_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_atomic_fmin v[0:1], v2, off
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB3_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32:
@@ -954,26 +634,9 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_mov_b32_e32 v6, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v5
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB3_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_noret_f32:
@@ -983,27 +646,9 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX6-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v5
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB3_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%unused = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst
@@ -1018,27 +663,10 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:2044
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB4_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos:
@@ -1068,53 +696,21 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off offset:2044
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB4_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_atomic_fmin v[0:1], v2, off offset:2044
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB4_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos:
@@ -1195,26 +791,9 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_mov_b32_e32 v6, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v5
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB4_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos:
@@ -1224,27 +803,9 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX6-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX6-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v5
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB4_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
@@ -1260,27 +821,10 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:-2048
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB5_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg:
@@ -1310,53 +854,21 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1)
; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off offset:-2048
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB5_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_atomic_fmin v[0:1], v2, off offset:-2048
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB5_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg:
@@ -1434,67 +946,24 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1)
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_movk_i32 s4, 0xf800
-; GFX7-NEXT: s_mov_b32 s5, -1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: s_mov_b32 s4, s6
-; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_mov_b32_e32 v6, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_mov_b32 s5, -1
+; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v5
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB5_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_movk_i32 s4, 0xf800
-; GFX6-NEXT: s_mov_b32 s5, -1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX6-NEXT: s_mov_b32 s4, s6
-; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: .LBB5_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX6-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_mov_b32 s5, -1
+; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v5
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB5_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
@@ -2010,28 +1479,10 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB8_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__ftz:
@@ -2062,55 +1513,21 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float
; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_min_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB8_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT: v_min_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB8_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__ftz:
@@ -2192,27 +1609,10 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GFX7-NEXT: v_min_f32_e32 v4, v3, v2
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB8_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__ftz:
@@ -2222,28 +1622,10 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GFX6-NEXT: v_min_f32_e32 v4, v3, v2
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
-; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB8_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v0, v3
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst
@@ -2258,28 +1640,10 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB9_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
@@ -2310,55 +1674,21 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace
; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_min_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB9_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT: v_min_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB9_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
@@ -2441,27 +1771,10 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GFX7-NEXT: v_min_f32_e32 v4, v3, v2
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB9_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
@@ -2471,28 +1784,10 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GFX6-NEXT: v_min_f32_e32 v4, v3, v2
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
-; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB9_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v0, v3
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
@@ -2508,28 +1803,10 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB10_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
@@ -2560,55 +1837,21 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace
; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_min_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off offset:-2048 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB10_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT: v_min_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off offset:-2048 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB10_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
@@ -2688,71 +1931,26 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_movk_i32 s4, 0xf800
-; GFX7-NEXT: v_mov_b32_e32 v4, v1
-; GFX7-NEXT: v_mov_b32_e32 v3, v0
-; GFX7-NEXT: s_mov_b32 s5, -1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v3
-; GFX7-NEXT: v_addc_u32_e32 v4, vcc, -1, v4, vcc
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: s_mov_b32 s4, s6
-; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; GFX7-NEXT: v_min_f32_e32 v5, v0, v2
-; GFX7-NEXT: v_mov_b32_e32 v0, v5
-; GFX7-NEXT: v_mov_b32_e32 v1, v6
-; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v[3:4], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_mov_b32 s5, -1
+; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB10_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_movk_i32 s4, 0xf800
-; GFX6-NEXT: v_mov_b32_e32 v4, v1
-; GFX6-NEXT: v_mov_b32_e32 v3, v0
-; GFX6-NEXT: s_mov_b32 s5, -1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64
-; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v3
-; GFX6-NEXT: v_addc_u32_e32 v4, vcc, -1, v4, vcc
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: s_mov_b32 s4, s6
-; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v0
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; GFX6-NEXT: v_min_f32_e32 v5, v0, v2
-; GFX6-NEXT: v_mov_b32_e32 v0, v5
-; GFX6-NEXT: v_mov_b32_e32 v1, v6
-; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v[3:4], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_mov_b32 s5, -1
+; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB10_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
@@ -2768,27 +1966,10 @@ define void @global_agent_atomic_fmin_noret_f32__ftz(ptr addrspace(1) %ptr, floa
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB11_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__ftz:
@@ -2818,53 +1999,21 @@ define void @global_agent_atomic_fmin_noret_f32__ftz(ptr addrspace(1) %ptr, floa
; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB11_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_atomic_fmin v[0:1], v2, off
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB11_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__ftz:
@@ -2943,26 +2092,9 @@ define void @global_agent_atomic_fmin_noret_f32__ftz(ptr addrspace(1) %ptr, floa
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_mov_b32_e32 v6, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v5
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB11_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__ftz:
@@ -2972,27 +2104,9 @@ define void @global_agent_atomic_fmin_noret_f32__ftz(ptr addrspace(1) %ptr, floa
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX6-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v5
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB11_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%unused = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst
@@ -3007,27 +2121,10 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspac
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:2044
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB12_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
@@ -3057,53 +2154,21 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspac
; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off offset:2044
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB12_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_atomic_fmin v[0:1], v2, off offset:2044
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB12_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
@@ -3184,26 +2249,9 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspac
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_mov_b32_e32 v6, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v5
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB12_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
@@ -3213,27 +2261,9 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspac
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX6-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v5
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB12_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
@@ -3249,27 +2279,10 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:-2048
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB13_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
@@ -3299,53 +2312,21 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac
; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off offset:-2048
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB13_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_atomic_fmin v[0:1], v2, off offset:-2048
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB13_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
@@ -3423,67 +2404,24 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_movk_i32 s4, 0xf800
-; GFX7-NEXT: s_mov_b32 s5, -1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: s_mov_b32 s4, s6
-; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_mov_b32_e32 v6, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_mov_b32 s5, -1
+; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v5
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB13_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_movk_i32 s4, 0xf800
-; GFX6-NEXT: s_mov_b32 s5, -1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX6-NEXT: s_mov_b32 s4, s6
-; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX6-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_mov_b32 s5, -1
+; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v5
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB13_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
@@ -4026,27 +2964,10 @@ define double @global_agent_atomic_fmin_ret_f64(ptr addrspace(1) %ptr, double %v
; GFX940-LABEL: global_agent_atomic_fmin_ret_f64:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX940-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX940-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off sc0
+; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB16_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fmin_ret_f64:
@@ -4080,54 +3001,19 @@ define double @global_agent_atomic_fmin_ret_f64(ptr addrspace(1) %ptr, double %v
; GFX10-LABEL: global_agent_atomic_fmin_ret_f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v7, v5
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
-; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX10-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB16_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
-; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB16_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmin_ret_f64:
@@ -4186,69 +3072,28 @@ define double @global_agent_atomic_fmin_ret_f64(ptr addrspace(1) %ptr, double %v
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_mov_b32_e32 v5, v1
-; GFX7-NEXT: v_mov_b32_e32 v4, v0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
-; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v11, v1
-; GFX7-NEXT: v_mov_b32_e32 v10, v0
-; GFX7-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX7-NEXT: v_min_f64 v[8:9], v[0:1], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v0, v8
-; GFX7-NEXT: v_mov_b32_e32 v1, v9
-; GFX7-NEXT: v_mov_b32_e32 v2, v10
-; GFX7-NEXT: v_mov_b32_e32 v3, v11
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
+; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB16_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
+; GFX7-NEXT: v_mov_b32_e32 v1, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_ret_f64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: v_mov_b32_e32 v5, v1
-; GFX6-NEXT: v_mov_b32_e32 v4, v0
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
-; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v1
-; GFX6-NEXT: v_mov_b32_e32 v10, v0
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX6-NEXT: v_min_f64 v[8:9], v[0:1], v[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v0, v8
-; GFX6-NEXT: v_mov_b32_e32 v1, v9
-; GFX6-NEXT: v_mov_b32_e32 v2, v10
-; GFX6-NEXT: v_mov_b32_e32 v3, v11
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
+; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB16_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
+; GFX6-NEXT: v_mov_b32_e32 v1, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst
@@ -4290,27 +3135,10 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos(ptr addrspace(1)
; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX940-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX940-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 sc0
+; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB17_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos:
@@ -4344,54 +3172,19 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos(ptr addrspace(1)
; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040
-; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v7, v5
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
-; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX10-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc
+; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off offset:2040 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB17_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
-; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc
+; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:2040 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB17_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos:
@@ -4450,69 +3243,28 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos(ptr addrspace(1)
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_mov_b32_e32 v5, v1
-; GFX7-NEXT: v_mov_b32_e32 v4, v0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:2040
-; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v11, v1
-; GFX7-NEXT: v_mov_b32_e32 v10, v0
-; GFX7-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX7-NEXT: v_min_f64 v[8:9], v[0:1], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v0, v8
-; GFX7-NEXT: v_mov_b32_e32 v1, v9
-; GFX7-NEXT: v_mov_b32_e32 v2, v10
-; GFX7-NEXT: v_mov_b32_e32 v3, v11
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 offset:2040 glc
+; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB17_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
+; GFX7-NEXT: v_mov_b32_e32 v1, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: v_mov_b32_e32 v5, v1
-; GFX6-NEXT: v_mov_b32_e32 v4, v0
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:2040
-; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v1
-; GFX6-NEXT: v_mov_b32_e32 v10, v0
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX6-NEXT: v_min_f64 v[8:9], v[0:1], v[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v0, v8
-; GFX6-NEXT: v_mov_b32_e32 v1, v9
-; GFX6-NEXT: v_mov_b32_e32 v2, v10
-; GFX6-NEXT: v_mov_b32_e32 v3, v11
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 offset:2040 glc
+; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB17_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
+; GFX6-NEXT: v_mov_b32_e32 v1, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr addrspace(1) %ptr, i64 255
@@ -4555,27 +3307,10 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg(ptr addrspace(1)
; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX940-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX940-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 sc0
+; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB18_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg:
@@ -4609,54 +3344,19 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg(ptr addrspace(1)
; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
-; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v7, v5
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
-; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX10-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
+; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off offset:-2048 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB18_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
-; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
+; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB18_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg:
@@ -4715,77 +3415,28 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg(ptr addrspace(1)
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_movk_i32 s4, 0xf800
-; GFX7-NEXT: v_mov_b32_e32 v5, v1
-; GFX7-NEXT: v_mov_b32_e32 v4, v0
-; GFX7-NEXT: s_mov_b32 s5, -1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v4
-; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v5, vcc
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: s_mov_b32 s4, s6
-; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v11, v1
-; GFX7-NEXT: v_mov_b32_e32 v10, v0
-; GFX7-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX7-NEXT: v_min_f64 v[8:9], v[0:1], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v0, v8
-; GFX7-NEXT: v_mov_b32_e32 v1, v9
-; GFX7-NEXT: v_mov_b32_e32 v2, v10
-; GFX7-NEXT: v_mov_b32_e32 v3, v11
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_mov_b32 s5, -1
+; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB18_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
+; GFX7-NEXT: v_mov_b32_e32 v1, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_movk_i32 s4, 0xf800
-; GFX6-NEXT: v_mov_b32_e32 v5, v1
-; GFX6-NEXT: v_mov_b32_e32 v4, v0
-; GFX6-NEXT: s_mov_b32 s5, -1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
-; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v4
-; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX6-NEXT: v_addc_u32_e32 v5, vcc, -1, v5, vcc
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: s_mov_b32 s4, s6
-; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v1
-; GFX6-NEXT: v_mov_b32_e32 v10, v0
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX6-NEXT: v_min_f64 v[8:9], v[0:1], v[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v0, v8
-; GFX6-NEXT: v_mov_b32_e32 v1, v9
-; GFX6-NEXT: v_mov_b32_e32 v2, v10
-; GFX6-NEXT: v_mov_b32_e32 v3, v11
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_mov_b32 s5, -1
+; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB18_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
+; GFX6-NEXT: v_mov_b32_e32 v1, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256
@@ -4827,25 +3478,10 @@ define void @global_agent_atomic_fmin_noret_f64(ptr addrspace(1) %ptr, double %v
; GFX940-LABEL: global_agent_atomic_fmin_noret_f64:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX940-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off sc0
+; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB19_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fmin_noret_f64:
@@ -4878,50 +3514,19 @@ define void @global_agent_atomic_fmin_noret_f64(ptr addrspace(1) %ptr, double %v
; GFX10-LABEL: global_agent_atomic_fmin_noret_f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX10-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[2:3], off
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v5, v3
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB19_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX90A-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
+; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB19_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmin_noret_f64:
@@ -4979,29 +3584,9 @@ define void @global_agent_atomic_fmin_noret_f64(ptr addrspace(1) %ptr, double %v
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX7-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v11, v5
-; GFX7-NEXT: v_mov_b32_e32 v10, v4
-; GFX7-NEXT: v_mov_b32_e32 v9, v3
-; GFX7-NEXT: v_mov_b32_e32 v8, v2
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v4, v8
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v5, v9
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB19_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_noret_f64:
@@ -5011,30 +3596,9 @@ define void @global_agent_atomic_fmin_noret_f64(ptr addrspace(1) %ptr, double %v
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v5
-; GFX6-NEXT: v_mov_b32_e32 v10, v4
-; GFX6-NEXT: v_mov_b32_e32 v9, v3
-; GFX6-NEXT: v_mov_b32_e32 v8, v2
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v4, v8
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v5, v9
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB19_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%unused = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst
@@ -5075,25 +3639,10 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos(ptr addrspace(1)
; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX940-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 sc0
+; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:2040
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB20_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos:
@@ -5126,50 +3675,19 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos(ptr addrspace(1)
; GFX10-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040
-; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX10-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[2:3], off offset:2040
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v5, v3
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB20_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX90A-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc
+; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:2040
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB20_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos:
@@ -5229,29 +3747,9 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos(ptr addrspace(1)
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:2040
-; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX7-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v11, v5
-; GFX7-NEXT: v_mov_b32_e32 v10, v4
-; GFX7-NEXT: v_mov_b32_e32 v9, v3
-; GFX7-NEXT: v_mov_b32_e32 v8, v2
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:2040 glc
+; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v4, v8
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v5, v9
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB20_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos:
@@ -5261,30 +3759,9 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos(ptr addrspace(1)
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:2040
-; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v5
-; GFX6-NEXT: v_mov_b32_e32 v10, v4
-; GFX6-NEXT: v_mov_b32_e32 v9, v3
-; GFX6-NEXT: v_mov_b32_e32 v8, v2
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:2040 glc
+; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v4, v8
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v5, v9
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB20_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr addrspace(1) %ptr, i64 255
@@ -5326,25 +3803,10 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg(ptr addrspace(1)
; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX940-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX940-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 sc0
+; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:-2048
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB21_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg:
@@ -5377,50 +3839,19 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg(ptr addrspace(1)
; GFX10-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
-; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX10-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[2:3], off offset:-2048
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v5, v3
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB21_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX90A-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc
+; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:-2048
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB21_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg:
@@ -5477,73 +3908,24 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg(ptr addrspace(1)
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_movk_i32 s4, 0xf800
-; GFX7-NEXT: s_mov_b32 s5, -1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: s_mov_b32 s4, s6
-; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX7-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v11, v5
-; GFX7-NEXT: v_mov_b32_e32 v10, v4
-; GFX7-NEXT: v_mov_b32_e32 v9, v3
-; GFX7-NEXT: v_mov_b32_e32 v8, v2
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_mov_b32 s5, -1
+; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v4, v8
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v5, v9
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB21_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_movk_i32 s4, 0xf800
-; GFX6-NEXT: s_mov_b32 s5, -1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
-; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: s_mov_b32 s4, s6
-; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v5
-; GFX6-NEXT: v_mov_b32_e32 v10, v4
-; GFX6-NEXT: v_mov_b32_e32 v9, v3
-; GFX6-NEXT: v_mov_b32_e32 v8, v2
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_mov_b32 s5, -1
+; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v4, v8
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v5, v9
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB21_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index 6548792..7f052e1 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -84,55 +84,29 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_cbranch_execz .LBB0_3
+; GFX1064-NEXT: s_cbranch_execz .LBB0_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execnz .LBB0_2
-; GFX1064-NEXT: .LBB0_3:
+; GFX1064-NEXT: global_atomic_fmax v0, v1, s[0:1]
+; GFX1064-NEXT: .LBB0_2:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT: s_cbranch_execz .LBB0_3
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB0_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
-; GFX1032-NEXT: .LBB0_2: ; %atomicrmw.start
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_cbranch_execnz .LBB0_2
-; GFX1032-NEXT: .LBB0_3:
+; GFX1032-NEXT: global_atomic_fmax v0, v1, s[0:1]
+; GFX1032-NEXT: .LBB0_2:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
@@ -142,60 +116,33 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT: s_cbranch_execz .LBB0_3
+; GFX1164-NEXT: s_cbranch_execz .LBB0_2
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-NEXT: .LBB0_2: ; %atomicrmw.start
-; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execnz .LBB0_2
-; GFX1164-NEXT: .LBB0_3:
+; GFX1164-NEXT: global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1164-NEXT: .LBB0_2:
+; GFX1164-NEXT: s_nop 0
+; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT: s_cbranch_execz .LBB0_3
+; GFX1132-NEXT: s_cbranch_execz .LBB0_2
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_mov_b32_e32 v2, 0
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
-; GFX1132-NEXT: .LBB0_2: ; %atomicrmw.start
-; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_cbranch_execnz .LBB0_2
-; GFX1132-NEXT: .LBB0_3:
+; GFX1132-NEXT: global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1132-NEXT: .LBB0_2:
+; GFX1132-NEXT: s_nop 0
+; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
@@ -233,55 +180,29 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_2
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_2
-; GFX1064-DPP-NEXT: .LBB0_3:
+; GFX1064-DPP-NEXT: global_atomic_fmax v0, v1, s[0:1]
+; GFX1064-DPP-NEXT: .LBB0_2:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_2
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
-; GFX1032-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2
-; GFX1032-DPP-NEXT: .LBB0_3:
+; GFX1032-DPP-NEXT: global_atomic_fmax v0, v1, s[0:1]
+; GFX1032-DPP-NEXT: .LBB0_2:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
@@ -291,60 +212,33 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_3
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_2
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1164-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB0_2
-; GFX1164-DPP-NEXT: .LBB0_3:
+; GFX1164-DPP-NEXT: global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1164-DPP-NEXT: .LBB0_2:
+; GFX1164-DPP-NEXT: s_nop 0
+; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_3
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_2
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
-; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
-; GFX1132-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1132-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB0_2
-; GFX1132-DPP-NEXT: .LBB0_3:
+; GFX1132-DPP-NEXT: global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1132-DPP-NEXT: .LBB0_2:
+; GFX1132-DPP-NEXT: s_nop 0
+; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-DPP-NEXT: s_endpgm
%result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4
ret void
@@ -501,18 +395,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1064-NEXT: s_mov_b32 s32, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: v_mov_b32_e32 v2, 0xff800000
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0xff800000
; GFX1064-NEXT: s_mov_b64 s[0:1], exec
; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX1064-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_max_f32_e32 v2, v1, v2
+; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -520,27 +414,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execz .LBB1_5
+; GFX1064-NEXT: s_cbranch_execz .LBB1_4
; GFX1064-NEXT: ; %bb.3:
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v0, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dword v1, v3, s[0:1]
-; GFX1064-NEXT: .LBB1_4: ; %atomicrmw.start
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execnz .LBB1_4
-; GFX1064-NEXT: .LBB1_5:
+; GFX1064-NEXT: global_atomic_fmax v0, v1, s[0:1]
+; GFX1064-NEXT: .LBB1_4:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
@@ -571,45 +451,31 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1032-NEXT: s_mov_b32 s32, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: v_mov_b32_e32 v2, 0xff800000
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0xff800000
; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: .LBB1_1: ; %ComputeLoop
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
-; GFX1032-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX1032-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1032-NEXT: v_max_f32_e32 v2, v1, v2
+; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execz .LBB1_5
+; GFX1032-NEXT: s_cbranch_execz .LBB1_4
; GFX1032-NEXT: ; %bb.3:
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX1032-NEXT: v_mov_b32_e32 v0, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_load_dword v1, v3, s[0:1]
-; GFX1032-NEXT: .LBB1_4: ; %atomicrmw.start
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_cbranch_execnz .LBB1_4
-; GFX1032-NEXT: .LBB1_5:
+; GFX1032-NEXT: global_atomic_fmax v0, v1, s[0:1]
+; GFX1032-NEXT: .LBB1_4:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
@@ -630,13 +496,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1164-NEXT: s_mov_b32 s32, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX1164-NEXT: v_mov_b32_e32 v2, 0xff800000
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0xff800000
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: .LBB1_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX1164-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
@@ -644,7 +510,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_max_f32_e32 v2, v1, v2
+; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -653,29 +519,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execz .LBB1_5
+; GFX1164-NEXT: s_cbranch_execz .LBB1_4
; GFX1164-NEXT: ; %bb.3:
; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v0, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: global_load_b32 v1, v3, s[0:1]
-; GFX1164-NEXT: .LBB1_4: ; %atomicrmw.start
-; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
-; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execnz .LBB1_4
-; GFX1164-NEXT: .LBB1_5:
+; GFX1164-NEXT: global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1164-NEXT: .LBB1_4:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
@@ -696,13 +546,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX1132-NEXT: v_mov_b32_e32 v2, 0xff800000
+; GFX1132-NEXT: v_mov_b32_e32 v1, 0xff800000
; GFX1132-NEXT: s_mov_b32 s0, exec_lo
; GFX1132-NEXT: .LBB1_1: ; %ComputeLoop
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
-; GFX1132-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -710,36 +560,21 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT: v_max_f32_e32 v2, v1, v2
+; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
; GFX1132-NEXT: s_mov_b32 s0, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execz .LBB1_5
+; GFX1132-NEXT: s_cbranch_execz .LBB1_4
; GFX1132-NEXT: ; %bb.3:
; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_max_f32 v2, v2, v2
+; GFX1132-NEXT: v_mov_b32_e32 v0, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: global_load_b32 v1, v3, s[0:1]
-; GFX1132-NEXT: .LBB1_4: ; %atomicrmw.start
-; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_cbranch_execnz .LBB1_4
-; GFX1132-NEXT: .LBB1_5:
+; GFX1132-NEXT: global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1132-NEXT: .LBB1_4:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
@@ -904,27 +739,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT: s_cbranch_execz .LBB1_3
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB1_2
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT: v_max_f32_e32 v6, v0, v0
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: global_load_dword v1, v2, s[0:1]
-; GFX1064-DPP-NEXT: .LBB1_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v0, v6
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB1_2
-; GFX1064-DPP-NEXT: .LBB1_3:
+; GFX1064-DPP-NEXT: global_atomic_fmax v1, v0, s[0:1]
+; GFX1064-DPP-NEXT: .LBB1_2:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
@@ -986,29 +807,15 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT: s_cbranch_execz .LBB1_3
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB1_2
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT: v_max_f32_e32 v6, v0, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: global_load_dword v1, v2, s[0:1]
-; GFX1032-DPP-NEXT: .LBB1_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v0, v6
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB1_2
-; GFX1032-DPP-NEXT: .LBB1_3:
+; GFX1032-DPP-NEXT: global_atomic_fmax v1, v0, s[0:1]
+; GFX1032-DPP-NEXT: .LBB1_2:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
@@ -1076,34 +883,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_3
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_2
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0
-; GFX1164-DPP-NEXT: v_max_f32_e32 v6, v4, v4
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: global_load_b32 v5, v0, s[0:1]
-; GFX1164-DPP-NEXT: .LBB1_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v4, v6
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB1_2
-; GFX1164-DPP-NEXT: .LBB1_3:
+; GFX1164-DPP-NEXT: global_atomic_max_f32 v4, v0, s[0:1]
+; GFX1164-DPP-NEXT: .LBB1_2:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
@@ -1159,34 +950,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v1
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_3
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_2
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0
-; GFX1132-DPP-NEXT: v_max_f32_e32 v6, v4, v4
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: global_load_b32 v5, v0, s[0:1]
-; GFX1132-DPP-NEXT: .LBB1_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v4, v6
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
-; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB1_2
-; GFX1132-DPP-NEXT: .LBB1_3:
+; GFX1132-DPP-NEXT: global_atomic_max_f32 v4, v0, s[0:1]
+; GFX1132-DPP-NEXT: .LBB1_2:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call float @div.float.value()
%result = atomicrmw fmax ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic, align 4
@@ -3626,59 +3401,31 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_cbranch_execz .LBB6_3
+; GFX1064-NEXT: s_cbranch_execz .LBB6_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-NEXT: .LBB6_2: ; %atomicrmw.start
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execnz .LBB6_2
-; GFX1064-NEXT: .LBB6_3:
+; GFX1064-NEXT: global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT: .LBB6_2:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT: s_cbranch_execz .LBB6_3
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB6_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
-; GFX1032-NEXT: .LBB6_2: ; %atomicrmw.start
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_cbranch_execnz .LBB6_2
-; GFX1032-NEXT: .LBB6_3:
+; GFX1032-NEXT: global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1032-NEXT: .LBB6_2:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
@@ -3783,59 +3530,31 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_2
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2
-; GFX1064-DPP-NEXT: .LBB6_3:
+; GFX1064-DPP-NEXT: global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1064-DPP-NEXT: .LBB6_2:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_2
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
-; GFX1032-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2
-; GFX1032-DPP-NEXT: .LBB6_3:
+; GFX1032-DPP-NEXT: global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1032-DPP-NEXT: .LBB6_2:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
@@ -4039,23 +3758,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1064-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: global_atomic_fmax_x2 v40, v[0:1], s[34:35]
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
@@ -4087,23 +3790,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1032-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: global_atomic_fmax_x2 v40, v[0:1], s[34:35]
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
@@ -4261,23 +3948,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1064-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: global_atomic_fmax_x2 v40, v[0:1], s[34:35]
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
@@ -4309,23 +3980,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1032-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: global_atomic_fmax_x2 v40, v[0:1], s[34:35]
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index 6936cdc..a9f49ad 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -84,55 +84,29 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_cbranch_execz .LBB0_3
+; GFX1064-NEXT: s_cbranch_execz .LBB0_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execnz .LBB0_2
-; GFX1064-NEXT: .LBB0_3:
+; GFX1064-NEXT: global_atomic_fmin v0, v1, s[0:1]
+; GFX1064-NEXT: .LBB0_2:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT: s_cbranch_execz .LBB0_3
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB0_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
-; GFX1032-NEXT: .LBB0_2: ; %atomicrmw.start
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_cbranch_execnz .LBB0_2
-; GFX1032-NEXT: .LBB0_3:
+; GFX1032-NEXT: global_atomic_fmin v0, v1, s[0:1]
+; GFX1032-NEXT: .LBB0_2:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
@@ -142,60 +116,33 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT: s_cbranch_execz .LBB0_3
+; GFX1164-NEXT: s_cbranch_execz .LBB0_2
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-NEXT: .LBB0_2: ; %atomicrmw.start
-; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execnz .LBB0_2
-; GFX1164-NEXT: .LBB0_3:
+; GFX1164-NEXT: global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1164-NEXT: .LBB0_2:
+; GFX1164-NEXT: s_nop 0
+; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT: s_cbranch_execz .LBB0_3
+; GFX1132-NEXT: s_cbranch_execz .LBB0_2
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_mov_b32_e32 v2, 0
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
-; GFX1132-NEXT: .LBB0_2: ; %atomicrmw.start
-; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_cbranch_execnz .LBB0_2
-; GFX1132-NEXT: .LBB0_3:
+; GFX1132-NEXT: global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1132-NEXT: .LBB0_2:
+; GFX1132-NEXT: s_nop 0
+; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
@@ -233,55 +180,29 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_2
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_2
-; GFX1064-DPP-NEXT: .LBB0_3:
+; GFX1064-DPP-NEXT: global_atomic_fmin v0, v1, s[0:1]
+; GFX1064-DPP-NEXT: .LBB0_2:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_2
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
-; GFX1032-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2
-; GFX1032-DPP-NEXT: .LBB0_3:
+; GFX1032-DPP-NEXT: global_atomic_fmin v0, v1, s[0:1]
+; GFX1032-DPP-NEXT: .LBB0_2:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
@@ -291,60 +212,33 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_3
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_2
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1164-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB0_2
-; GFX1164-DPP-NEXT: .LBB0_3:
+; GFX1164-DPP-NEXT: global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1164-DPP-NEXT: .LBB0_2:
+; GFX1164-DPP-NEXT: s_nop 0
+; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_3
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_2
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
-; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
-; GFX1132-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1132-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB0_2
-; GFX1132-DPP-NEXT: .LBB0_3:
+; GFX1132-DPP-NEXT: global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1132-DPP-NEXT: .LBB0_2:
+; GFX1132-DPP-NEXT: s_nop 0
+; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-DPP-NEXT: s_endpgm
%result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4
ret void
@@ -501,18 +395,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1064-NEXT: s_mov_b32 s32, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: v_mov_b32_e32 v2, 0x7f800000
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0x7f800000
; GFX1064-NEXT: s_mov_b64 s[0:1], exec
; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX1064-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_min_f32_e32 v2, v1, v2
+; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -520,27 +414,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execz .LBB1_5
+; GFX1064-NEXT: s_cbranch_execz .LBB1_4
; GFX1064-NEXT: ; %bb.3:
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v0, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dword v1, v3, s[0:1]
-; GFX1064-NEXT: .LBB1_4: ; %atomicrmw.start
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execnz .LBB1_4
-; GFX1064-NEXT: .LBB1_5:
+; GFX1064-NEXT: global_atomic_fmin v0, v1, s[0:1]
+; GFX1064-NEXT: .LBB1_4:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
@@ -571,45 +451,31 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1032-NEXT: s_mov_b32 s32, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: v_mov_b32_e32 v2, 0x7f800000
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0x7f800000
; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: .LBB1_1: ; %ComputeLoop
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
-; GFX1032-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX1032-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1032-NEXT: v_min_f32_e32 v2, v1, v2
+; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execz .LBB1_5
+; GFX1032-NEXT: s_cbranch_execz .LBB1_4
; GFX1032-NEXT: ; %bb.3:
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX1032-NEXT: v_mov_b32_e32 v0, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_load_dword v1, v3, s[0:1]
-; GFX1032-NEXT: .LBB1_4: ; %atomicrmw.start
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_cbranch_execnz .LBB1_4
-; GFX1032-NEXT: .LBB1_5:
+; GFX1032-NEXT: global_atomic_fmin v0, v1, s[0:1]
+; GFX1032-NEXT: .LBB1_4:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
@@ -630,13 +496,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1164-NEXT: s_mov_b32 s32, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX1164-NEXT: v_mov_b32_e32 v2, 0x7f800000
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0x7f800000
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: .LBB1_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX1164-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
@@ -644,7 +510,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_min_f32_e32 v2, v1, v2
+; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -653,29 +519,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execz .LBB1_5
+; GFX1164-NEXT: s_cbranch_execz .LBB1_4
; GFX1164-NEXT: ; %bb.3:
; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v0, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: global_load_b32 v1, v3, s[0:1]
-; GFX1164-NEXT: .LBB1_4: ; %atomicrmw.start
-; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
-; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execnz .LBB1_4
-; GFX1164-NEXT: .LBB1_5:
+; GFX1164-NEXT: global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1164-NEXT: .LBB1_4:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
@@ -696,13 +546,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX1132-NEXT: v_mov_b32_e32 v2, 0x7f800000
+; GFX1132-NEXT: v_mov_b32_e32 v1, 0x7f800000
; GFX1132-NEXT: s_mov_b32 s0, exec_lo
; GFX1132-NEXT: .LBB1_1: ; %ComputeLoop
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
-; GFX1132-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -710,36 +560,21 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT: v_min_f32_e32 v2, v1, v2
+; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
; GFX1132-NEXT: s_mov_b32 s0, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execz .LBB1_5
+; GFX1132-NEXT: s_cbranch_execz .LBB1_4
; GFX1132-NEXT: ; %bb.3:
; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_max_f32 v2, v2, v2
+; GFX1132-NEXT: v_mov_b32_e32 v0, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: global_load_b32 v1, v3, s[0:1]
-; GFX1132-NEXT: .LBB1_4: ; %atomicrmw.start
-; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_cbranch_execnz .LBB1_4
-; GFX1132-NEXT: .LBB1_5:
+; GFX1132-NEXT: global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1132-NEXT: .LBB1_4:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
@@ -904,27 +739,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT: s_cbranch_execz .LBB1_3
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB1_2
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT: v_max_f32_e32 v6, v0, v0
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: global_load_dword v1, v2, s[0:1]
-; GFX1064-DPP-NEXT: .LBB1_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT: v_min_f32_e32 v0, v0, v6
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB1_2
-; GFX1064-DPP-NEXT: .LBB1_3:
+; GFX1064-DPP-NEXT: global_atomic_fmin v1, v0, s[0:1]
+; GFX1064-DPP-NEXT: .LBB1_2:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
@@ -986,29 +807,15 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT: s_cbranch_execz .LBB1_3
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB1_2
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT: v_max_f32_e32 v6, v0, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: global_load_dword v1, v2, s[0:1]
-; GFX1032-DPP-NEXT: .LBB1_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT: v_min_f32_e32 v0, v0, v6
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB1_2
-; GFX1032-DPP-NEXT: .LBB1_3:
+; GFX1032-DPP-NEXT: global_atomic_fmin v1, v0, s[0:1]
+; GFX1032-DPP-NEXT: .LBB1_2:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
@@ -1076,34 +883,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_3
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_2
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0
-; GFX1164-DPP-NEXT: v_max_f32_e32 v6, v4, v4
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: global_load_b32 v5, v0, s[0:1]
-; GFX1164-DPP-NEXT: .LBB1_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_min_f32_e32 v4, v4, v6
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB1_2
-; GFX1164-DPP-NEXT: .LBB1_3:
+; GFX1164-DPP-NEXT: global_atomic_min_f32 v4, v0, s[0:1]
+; GFX1164-DPP-NEXT: .LBB1_2:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
@@ -1159,34 +950,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v1
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_3
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_2
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0
-; GFX1132-DPP-NEXT: v_max_f32_e32 v6, v4, v4
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: global_load_b32 v5, v0, s[0:1]
-; GFX1132-DPP-NEXT: .LBB1_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_min_f32_e32 v4, v4, v6
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
-; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB1_2
-; GFX1132-DPP-NEXT: .LBB1_3:
+; GFX1132-DPP-NEXT: global_atomic_min_f32 v4, v0, s[0:1]
+; GFX1132-DPP-NEXT: .LBB1_2:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call float @div.float.value()
%result = atomicrmw fmin ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic, align 4
@@ -3626,59 +3401,31 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_cbranch_execz .LBB6_3
+; GFX1064-NEXT: s_cbranch_execz .LBB6_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-NEXT: .LBB6_2: ; %atomicrmw.start
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execnz .LBB6_2
-; GFX1064-NEXT: .LBB6_3:
+; GFX1064-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT: .LBB6_2:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT: s_cbranch_execz .LBB6_3
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB6_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
-; GFX1032-NEXT: .LBB6_2: ; %atomicrmw.start
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_cbranch_execnz .LBB6_2
-; GFX1032-NEXT: .LBB6_3:
+; GFX1032-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1032-NEXT: .LBB6_2:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
@@ -3783,59 +3530,31 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_2
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2
-; GFX1064-DPP-NEXT: .LBB6_3:
+; GFX1064-DPP-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1064-DPP-NEXT: .LBB6_2:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_2
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
-; GFX1032-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2
-; GFX1032-DPP-NEXT: .LBB6_3:
+; GFX1032-DPP-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1032-DPP-NEXT: .LBB6_2:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
@@ -4039,23 +3758,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1064-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: global_atomic_fmin_x2 v40, v[0:1], s[34:35]
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
@@ -4087,23 +3790,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1032-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: global_atomic_fmin_x2 v40, v[0:1], s[34:35]
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
@@ -4261,23 +3948,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1064-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: global_atomic_fmin_x2 v40, v[0:1], s[34:35]
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
@@ -4309,23 +3980,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1032-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: global_atomic_fmin_x2 v40, v[0:1], s[34:35]
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll
new file mode 100644
index 0000000..1fb5d53
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll
@@ -0,0 +1,638 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s
+; Not supported in gfx8 or gfx9
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+
+define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen glc
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s4 idxen offen glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
+ ret float %ret
+}
+
+define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen offset:256 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen offset:256 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen offset:256 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s4 idxen offen offset:256 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s4 idxen offen offset:256 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %voffset.add = add i32 %voffset, 256
+ %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
+ ret float %ret
+}
+
+define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[4:7], s8 idxen glc
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], s8 idxen glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], s8 idxen glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], s4 idxen glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], s4 idxen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
+ ret float %ret
+}
+
+define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen glc slc
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen glc slc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen glc slc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s4 idxen offen glc slc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_NT_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
+ ret float %ret
+}
+
+define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s4 idxen offen
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s4 idxen offen
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
+ ret void
+}
+
+define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen offset:256
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen offset:256
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen offset:256
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s4 idxen offen offset:256
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s4 idxen offen offset:256
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %voffset.add = add i32 %voffset, 256
+ %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
+ ret void
+}
+
+; Natural mapping, no voffset
+define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[4:7], s8 idxen
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], s8 idxen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], s8 idxen
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], s4 idxen
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], s4 idxen
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
+ ret void
+}
+
+define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen slc
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen slc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen slc
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s4 idxen offen slc
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_NT
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
+ ret void
+}
+
+; Test waterfall loop on resource
+define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_add__sgpr_soffset(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_mov_b64 s[12:13], exec
+; GFX6-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_readfirstlane_b32 s8, v1
+; GFX6-NEXT: v_readfirstlane_b32 s9, v2
+; GFX6-NEXT: v_readfirstlane_b32 s10, v3
+; GFX6-NEXT: v_readfirstlane_b32 s11, v4
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2]
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[3:4]
+; GFX6-NEXT: s_and_b64 s[6:7], vcc, s[6:7]
+; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[6:7]
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s4 idxen offen offset:256 glc
+; GFX6-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX6-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX6-NEXT: s_xor_b64 exec, exec, s[6:7]
+; GFX6-NEXT: s_cbranch_execnz .LBB8_1
+; GFX6-NEXT: ; %bb.2:
+; GFX6-NEXT: s_mov_b64 exec, s[12:13]
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b64 s[12:13], exec
+; GFX7-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_readfirstlane_b32 s8, v1
+; GFX7-NEXT: v_readfirstlane_b32 s9, v2
+; GFX7-NEXT: v_readfirstlane_b32 s10, v3
+; GFX7-NEXT: v_readfirstlane_b32 s11, v4
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2]
+; GFX7-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[3:4]
+; GFX7-NEXT: s_and_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[6:7]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s4 idxen offen offset:256 glc
+; GFX7-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX7-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX7-NEXT: s_xor_b64 exec, exec, s[6:7]
+; GFX7-NEXT: s_cbranch_execnz .LBB8_1
+; GFX7-NEXT: ; %bb.2:
+; GFX7-NEXT: s_mov_b64 exec, s[12:13]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s6, exec_lo
+; GFX10-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_readfirstlane_b32 s8, v1
+; GFX10-NEXT: v_readfirstlane_b32 s9, v2
+; GFX10-NEXT: v_readfirstlane_b32 s10, v3
+; GFX10-NEXT: v_readfirstlane_b32 s11, v4
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[1:2]
+; GFX10-NEXT: v_cmp_eq_u64_e64 s5, s[10:11], v[3:4]
+; GFX10-NEXT: s_and_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_and_saveexec_b32 s5, s5
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s4 idxen offen offset:256 glc
+; GFX10-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX10-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_cbranch_execnz .LBB8_1
+; GFX10-NEXT: ; %bb.2:
+; GFX10-NEXT: s_mov_b32 exec_lo, s6
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: v_readfirstlane_b32 s4, v1
+; GFX11-NEXT: v_readfirstlane_b32 s5, v2
+; GFX11-NEXT: v_readfirstlane_b32 s6, v3
+; GFX11-NEXT: v_readfirstlane_b32 s7, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
+; GFX11-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s1, vcc_lo, s1
+; GFX11-NEXT: s_and_saveexec_b32 s1, s1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_atomic_max_f32 v0, v[5:6], s[4:7], s0 idxen offen offset:256 glc
+; GFX11-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX11-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: s_cbranch_execnz .LBB8_1
+; GFX11-NEXT: ; %bb.2:
+; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s2, exec_lo
+; GFX12-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: v_readfirstlane_b32 s4, v1
+; GFX12-NEXT: v_readfirstlane_b32 s5, v2
+; GFX12-NEXT: v_readfirstlane_b32 s6, v3
+; GFX12-NEXT: v_readfirstlane_b32 s7, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
+; GFX12-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_b32 s1, vcc_lo, s1
+; GFX12-NEXT: s_and_saveexec_b32 s1, s1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[5:6], s[4:7], s0 idxen offen offset:256 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_cbranch_execnz .LBB8_1
+; GFX12-NEXT: ; %bb.2:
+; GFX12-NEXT: s_mov_b32 exec_lo, s2
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %voffset.add = add i32 %voffset, 256
+ %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
+ ret float %ret
+}
+
+; Test waterfall loop on soffset
+define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__vgpr_soffset(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__vgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_mov_b64 s[6:7], exec
+; GFX6-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_readfirstlane_b32 s8, v1
+; GFX6-NEXT: v_readfirstlane_b32 s9, v2
+; GFX6-NEXT: v_readfirstlane_b32 s10, v3
+; GFX6-NEXT: v_readfirstlane_b32 s11, v4
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2]
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[3:4]
+; GFX6-NEXT: v_readfirstlane_b32 s12, v7
+; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s12, v7
+; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], vcc
+; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s12 idxen offen offset:256 glc
+; GFX6-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX6-NEXT: ; implicit-def: $vgpr7
+; GFX6-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX6-NEXT: s_cbranch_execnz .LBB9_1
+; GFX6-NEXT: ; %bb.2:
+; GFX6-NEXT: s_mov_b64 exec, s[6:7]
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__vgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b64 s[6:7], exec
+; GFX7-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_readfirstlane_b32 s8, v1
+; GFX7-NEXT: v_readfirstlane_b32 s9, v2
+; GFX7-NEXT: v_readfirstlane_b32 s10, v3
+; GFX7-NEXT: v_readfirstlane_b32 s11, v4
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2]
+; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[3:4]
+; GFX7-NEXT: v_readfirstlane_b32 s12, v7
+; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, s12, v7
+; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], vcc
+; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s12 idxen offen offset:256 glc
+; GFX7-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX7-NEXT: ; implicit-def: $vgpr7
+; GFX7-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB9_1
+; GFX7-NEXT: ; %bb.2:
+; GFX7-NEXT: s_mov_b64 exec, s[6:7]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__vgpr_soffset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s6, exec_lo
+; GFX10-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_readfirstlane_b32 s8, v1
+; GFX10-NEXT: v_readfirstlane_b32 s9, v2
+; GFX10-NEXT: v_readfirstlane_b32 s10, v3
+; GFX10-NEXT: v_readfirstlane_b32 s11, v4
+; GFX10-NEXT: v_readfirstlane_b32 s7, v7
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[1:2]
+; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[3:4]
+; GFX10-NEXT: v_cmp_eq_u32_e64 s5, s7, v7
+; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_and_b32 s4, s4, s5
+; GFX10-NEXT: s_and_saveexec_b32 s4, s4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s7 idxen offen offset:256 glc
+; GFX10-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX10-NEXT: ; implicit-def: $vgpr7
+; GFX10-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB9_1
+; GFX10-NEXT: ; %bb.2:
+; GFX10-NEXT: s_mov_b32 exec_lo, s6
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__vgpr_soffset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: v_readfirstlane_b32 s4, v1
+; GFX11-NEXT: v_readfirstlane_b32 s5, v2
+; GFX11-NEXT: v_readfirstlane_b32 s6, v3
+; GFX11-NEXT: v_readfirstlane_b32 s7, v4
+; GFX11-NEXT: v_readfirstlane_b32 s3, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
+; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s0, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_atomic_max_f32 v0, v[5:6], s[4:7], s3 idxen offen offset:256 glc
+; GFX11-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX11-NEXT: ; implicit-def: $vgpr7
+; GFX11-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB9_1
+; GFX11-NEXT: ; %bb.2:
+; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__vgpr_soffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s2, exec_lo
+; GFX12-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: v_readfirstlane_b32 s4, v1
+; GFX12-NEXT: v_readfirstlane_b32 s5, v2
+; GFX12-NEXT: v_readfirstlane_b32 s6, v3
+; GFX12-NEXT: v_readfirstlane_b32 s7, v4
+; GFX12-NEXT: v_readfirstlane_b32 s3, v7
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
+; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
+; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_b32 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[5:6], s[4:7], s3 idxen offen offset:256 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX12-NEXT: ; implicit-def: $vgpr7
+; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB9_1
+; GFX12-NEXT: ; %bb.2:
+; GFX12-NEXT: s_mov_b32 exec_lo, s2
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %voffset.add = add i32 %voffset, 256
+ %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
+ ret float %ret
+}
+
+declare float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float, ptr addrspace(8), i32, i32, i32, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64.ll
new file mode 100644
index 0000000..b859147
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64.ll
@@ -0,0 +1,271 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s
+; Not supported in gfx8 or gfx9, except gfx90a/gfx940 (the run lines for those two targets below are currently disabled, spelled "xUN")
+; xUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90A %s
+; xUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX940 %s
+
+define double @struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen glc
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
+ ret double %ret
+}
+
+define double @struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmax__sgpr_soffset(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmax__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen offset:256 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmax__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen offset:256 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %voffset.add = add i32 %voffset, 256
+ %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
+ ret double %ret
+}
+
+define double @struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], s8 idxen glc
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], s8 idxen glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
+ ret double %ret
+}
+
+define double @struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen glc slc
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen glc slc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
+ ret double %ret
+}
+
+define void @struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
+ ret void
+}
+
+define void @struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmax__sgpr_soffset(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmax__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen offset:256
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmax__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen offset:256
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %voffset.add = add i32 %voffset, 256
+ %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
+ ret void
+}
+
+; Natural mapping, no voffset (the voffset operand is the constant 0, so only idxen addressing is expected)
+define void @struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], s8 idxen
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], s8 idxen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
+ ret void
+}
+
+define void @struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen slc
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen slc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
+ ret void
+}
+
+; Test waterfall loop on resource (rsrc is passed in VGPRs; soffset stays in an SGPR)
+define double @struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_fmax__sgpr_soffset(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_fmax__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_mov_b64 s[12:13], exec
+; GFX6-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_readfirstlane_b32 s8, v2
+; GFX6-NEXT: v_readfirstlane_b32 s9, v3
+; GFX6-NEXT: v_readfirstlane_b32 s10, v4
+; GFX6-NEXT: v_readfirstlane_b32 s11, v5
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3]
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[4:5]
+; GFX6-NEXT: s_and_b64 s[6:7], vcc, s[6:7]
+; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[6:7]
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[6:7], s[8:11], s4 idxen offen offset:256 glc
+; GFX6-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX6-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX6-NEXT: s_xor_b64 exec, exec, s[6:7]
+; GFX6-NEXT: s_cbranch_execnz .LBB8_1
+; GFX6-NEXT: ; %bb.2:
+; GFX6-NEXT: s_mov_b64 exec, s[12:13]
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_fmax__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b64 s[12:13], exec
+; GFX7-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_readfirstlane_b32 s8, v2
+; GFX7-NEXT: v_readfirstlane_b32 s9, v3
+; GFX7-NEXT: v_readfirstlane_b32 s10, v4
+; GFX7-NEXT: v_readfirstlane_b32 s11, v5
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3]
+; GFX7-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[4:5]
+; GFX7-NEXT: s_and_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[6:7]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[6:7], s[8:11], s4 idxen offen offset:256 glc
+; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX7-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX7-NEXT: s_xor_b64 exec, exec, s[6:7]
+; GFX7-NEXT: s_cbranch_execnz .LBB8_1
+; GFX7-NEXT: ; %bb.2:
+; GFX7-NEXT: s_mov_b64 exec, s[12:13]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %voffset.add = add i32 %voffset, 256
+ %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
+ ret double %ret
+}
+
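+; Test waterfall loop on soffset (rsrc is not inreg here either, so the loop covers both rsrc and soffset despite "sgpr_rsrc" in the name)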
+define double @struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmax__vgpr_soffset(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmax__vgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_mov_b64 s[6:7], exec
+; GFX6-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_readfirstlane_b32 s8, v2
+; GFX6-NEXT: v_readfirstlane_b32 s9, v3
+; GFX6-NEXT: v_readfirstlane_b32 s10, v4
+; GFX6-NEXT: v_readfirstlane_b32 s11, v5
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3]
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[4:5]
+; GFX6-NEXT: v_readfirstlane_b32 s12, v8
+; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s12, v8
+; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], vcc
+; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[6:7], s[8:11], s12 idxen offen offset:256 glc
+; GFX6-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX6-NEXT: ; implicit-def: $vgpr8
+; GFX6-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX6-NEXT: s_cbranch_execnz .LBB9_1
+; GFX6-NEXT: ; %bb.2:
+; GFX6-NEXT: s_mov_b64 exec, s[6:7]
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmax__vgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b64 s[6:7], exec
+; GFX7-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_readfirstlane_b32 s8, v2
+; GFX7-NEXT: v_readfirstlane_b32 s9, v3
+; GFX7-NEXT: v_readfirstlane_b32 s10, v4
+; GFX7-NEXT: v_readfirstlane_b32 s11, v5
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3]
+; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[4:5]
+; GFX7-NEXT: v_readfirstlane_b32 s12, v8
+; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, s12, v8
+; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], vcc
+; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[6:7], s[8:11], s12 idxen offen offset:256 glc
+; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX7-NEXT: ; implicit-def: $vgpr8
+; GFX7-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB9_1
+; GFX7-NEXT: ; %bb.2:
+; GFX7-NEXT: s_mov_b64 exec, s[6:7]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %voffset.add = add i32 %voffset, 256
+ %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
+ ret double %ret
+}
+
+declare double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double, ptr addrspace(8), i32, i32, i32, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll
new file mode 100644
index 0000000..87055db
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll
@@ -0,0 +1,638 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s
+; Not supported in gfx8 or gfx9
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+
+define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen glc
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s4 idxen offen glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
+ ret float %ret
+}
+
+define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen offset:256 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen offset:256 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen offset:256 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s4 idxen offen offset:256 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s4 idxen offen offset:256 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %voffset.add = add i32 %voffset, 256
+ %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
+ ret float %ret
+}
+
+define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[4:7], s8 idxen glc
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], s8 idxen glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], s8 idxen glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], s4 idxen glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], s4 idxen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
+ ret float %ret
+}
+
+define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen glc slc
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen glc slc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen glc slc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s4 idxen offen glc slc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_NT_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
+ ret float %ret
+}
+
+define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s4 idxen offen
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s4 idxen offen
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
+ ret void
+}
+
+define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen offset:256
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen offset:256
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen offset:256
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s4 idxen offen offset:256
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s4 idxen offen offset:256
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %voffset.add = add i32 %voffset, 256
+ %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
+ ret void
+}
+
+; Natural mapping, no voffset (the voffset operand is the constant 0, so only idxen addressing is expected)
+define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[4:7], s8 idxen
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], s8 idxen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], s8 idxen
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], s4 idxen
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], s4 idxen
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
+ ret void
+}
+
+define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen slc
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen slc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen slc
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s4 idxen offen slc
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_NT
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
+ ret void
+}
+
+; Test waterfall loop on resource
+define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_mov_b64 s[12:13], exec
+; GFX6-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_readfirstlane_b32 s8, v1
+; GFX6-NEXT: v_readfirstlane_b32 s9, v2
+; GFX6-NEXT: v_readfirstlane_b32 s10, v3
+; GFX6-NEXT: v_readfirstlane_b32 s11, v4
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2]
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[3:4]
+; GFX6-NEXT: s_and_b64 s[6:7], vcc, s[6:7]
+; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[6:7]
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s4 idxen offen offset:256 glc
+; GFX6-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX6-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX6-NEXT: s_xor_b64 exec, exec, s[6:7]
+; GFX6-NEXT: s_cbranch_execnz .LBB8_1
+; GFX6-NEXT: ; %bb.2:
+; GFX6-NEXT: s_mov_b64 exec, s[12:13]
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b64 s[12:13], exec
+; GFX7-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_readfirstlane_b32 s8, v1
+; GFX7-NEXT: v_readfirstlane_b32 s9, v2
+; GFX7-NEXT: v_readfirstlane_b32 s10, v3
+; GFX7-NEXT: v_readfirstlane_b32 s11, v4
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2]
+; GFX7-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[3:4]
+; GFX7-NEXT: s_and_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[6:7]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s4 idxen offen offset:256 glc
+; GFX7-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX7-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX7-NEXT: s_xor_b64 exec, exec, s[6:7]
+; GFX7-NEXT: s_cbranch_execnz .LBB8_1
+; GFX7-NEXT: ; %bb.2:
+; GFX7-NEXT: s_mov_b64 exec, s[12:13]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s6, exec_lo
+; GFX10-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_readfirstlane_b32 s8, v1
+; GFX10-NEXT: v_readfirstlane_b32 s9, v2
+; GFX10-NEXT: v_readfirstlane_b32 s10, v3
+; GFX10-NEXT: v_readfirstlane_b32 s11, v4
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[1:2]
+; GFX10-NEXT: v_cmp_eq_u64_e64 s5, s[10:11], v[3:4]
+; GFX10-NEXT: s_and_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_and_saveexec_b32 s5, s5
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s4 idxen offen offset:256 glc
+; GFX10-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX10-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_cbranch_execnz .LBB8_1
+; GFX10-NEXT: ; %bb.2:
+; GFX10-NEXT: s_mov_b32 exec_lo, s6
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: v_readfirstlane_b32 s4, v1
+; GFX11-NEXT: v_readfirstlane_b32 s5, v2
+; GFX11-NEXT: v_readfirstlane_b32 s6, v3
+; GFX11-NEXT: v_readfirstlane_b32 s7, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
+; GFX11-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s1, vcc_lo, s1
+; GFX11-NEXT: s_and_saveexec_b32 s1, s1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_atomic_min_f32 v0, v[5:6], s[4:7], s0 idxen offen offset:256 glc
+; GFX11-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX11-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: s_cbranch_execnz .LBB8_1
+; GFX11-NEXT: ; %bb.2:
+; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s2, exec_lo
+; GFX12-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: v_readfirstlane_b32 s4, v1
+; GFX12-NEXT: v_readfirstlane_b32 s5, v2
+; GFX12-NEXT: v_readfirstlane_b32 s6, v3
+; GFX12-NEXT: v_readfirstlane_b32 s7, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
+; GFX12-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_b32 s1, vcc_lo, s1
+; GFX12-NEXT: s_and_saveexec_b32 s1, s1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[5:6], s[4:7], s0 idxen offen offset:256 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_cbranch_execnz .LBB8_1
+; GFX12-NEXT: ; %bb.2:
+; GFX12-NEXT: s_mov_b32 exec_lo, s2
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %voffset.add = add i32 %voffset, 256
+ %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
+ ret float %ret
+}
+
+; Test waterfall loop on soffset
+define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__vgpr_soffset(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__vgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_mov_b64 s[6:7], exec
+; GFX6-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_readfirstlane_b32 s8, v1
+; GFX6-NEXT: v_readfirstlane_b32 s9, v2
+; GFX6-NEXT: v_readfirstlane_b32 s10, v3
+; GFX6-NEXT: v_readfirstlane_b32 s11, v4
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2]
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[3:4]
+; GFX6-NEXT: v_readfirstlane_b32 s12, v7
+; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s12, v7
+; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], vcc
+; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s12 idxen offen offset:256 glc
+; GFX6-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX6-NEXT: ; implicit-def: $vgpr7
+; GFX6-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX6-NEXT: s_cbranch_execnz .LBB9_1
+; GFX6-NEXT: ; %bb.2:
+; GFX6-NEXT: s_mov_b64 exec, s[6:7]
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__vgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b64 s[6:7], exec
+; GFX7-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_readfirstlane_b32 s8, v1
+; GFX7-NEXT: v_readfirstlane_b32 s9, v2
+; GFX7-NEXT: v_readfirstlane_b32 s10, v3
+; GFX7-NEXT: v_readfirstlane_b32 s11, v4
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2]
+; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[3:4]
+; GFX7-NEXT: v_readfirstlane_b32 s12, v7
+; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, s12, v7
+; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], vcc
+; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s12 idxen offen offset:256 glc
+; GFX7-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX7-NEXT: ; implicit-def: $vgpr7
+; GFX7-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB9_1
+; GFX7-NEXT: ; %bb.2:
+; GFX7-NEXT: s_mov_b64 exec, s[6:7]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__vgpr_soffset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s6, exec_lo
+; GFX10-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_readfirstlane_b32 s8, v1
+; GFX10-NEXT: v_readfirstlane_b32 s9, v2
+; GFX10-NEXT: v_readfirstlane_b32 s10, v3
+; GFX10-NEXT: v_readfirstlane_b32 s11, v4
+; GFX10-NEXT: v_readfirstlane_b32 s7, v7
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[1:2]
+; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[3:4]
+; GFX10-NEXT: v_cmp_eq_u32_e64 s5, s7, v7
+; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_and_b32 s4, s4, s5
+; GFX10-NEXT: s_and_saveexec_b32 s4, s4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s7 idxen offen offset:256 glc
+; GFX10-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX10-NEXT: ; implicit-def: $vgpr7
+; GFX10-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB9_1
+; GFX10-NEXT: ; %bb.2:
+; GFX10-NEXT: s_mov_b32 exec_lo, s6
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__vgpr_soffset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: v_readfirstlane_b32 s4, v1
+; GFX11-NEXT: v_readfirstlane_b32 s5, v2
+; GFX11-NEXT: v_readfirstlane_b32 s6, v3
+; GFX11-NEXT: v_readfirstlane_b32 s7, v4
+; GFX11-NEXT: v_readfirstlane_b32 s3, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
+; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s0, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_atomic_min_f32 v0, v[5:6], s[4:7], s3 idxen offen offset:256 glc
+; GFX11-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX11-NEXT: ; implicit-def: $vgpr7
+; GFX11-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB9_1
+; GFX11-NEXT: ; %bb.2:
+; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__vgpr_soffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s2, exec_lo
+; GFX12-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: v_readfirstlane_b32 s4, v1
+; GFX12-NEXT: v_readfirstlane_b32 s5, v2
+; GFX12-NEXT: v_readfirstlane_b32 s6, v3
+; GFX12-NEXT: v_readfirstlane_b32 s7, v4
+; GFX12-NEXT: v_readfirstlane_b32 s3, v7
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
+; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
+; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_b32 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[5:6], s[4:7], s3 idxen offen offset:256 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX12-NEXT: ; implicit-def: $vgpr7
+; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB9_1
+; GFX12-NEXT: ; %bb.2:
+; GFX12-NEXT: s_mov_b32 exec_lo, s2
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %voffset.add = add i32 %voffset, 256
+ %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
+ ret float %ret
+}
+
+declare float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float, ptr addrspace(8), i32, i32, i32, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64.ll
new file mode 100644
index 0000000..5c23a86
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64.ll
@@ -0,0 +1,271 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s
+; Not supported in gfx8 or gfx9, except 90a/940
+; xUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90A %s
+; xUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX940 %s
+
+define double @struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen glc
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
+ ret double %ret
+}
+
+define double @struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen offset:256 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen offset:256 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %voffset.add = add i32 %voffset, 256
+ %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
+ ret double %ret
+}
+
+define double @struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], s8 idxen glc
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], s8 idxen glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
+ ret double %ret
+}
+
+define double @struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen glc slc
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen glc slc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
+ ret double %ret
+}
+
+define void @struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
+ ret void
+}
+
+define void @struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen offset:256
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen offset:256
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %voffset.add = add i32 %voffset, 256
+ %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
+ ret void
+}
+
+; Natural mapping, no voffset
+define void @struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], s8 idxen
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], s8 idxen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
+ ret void
+}
+
+define void @struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen slc
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen slc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
+ ret void
+}
+
+; Test waterfall loop on resource
+define double @struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_mov_b64 s[12:13], exec
+; GFX6-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_readfirstlane_b32 s8, v2
+; GFX6-NEXT: v_readfirstlane_b32 s9, v3
+; GFX6-NEXT: v_readfirstlane_b32 s10, v4
+; GFX6-NEXT: v_readfirstlane_b32 s11, v5
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3]
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[4:5]
+; GFX6-NEXT: s_and_b64 s[6:7], vcc, s[6:7]
+; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[6:7]
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[6:7], s[8:11], s4 idxen offen offset:256 glc
+; GFX6-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX6-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX6-NEXT: s_xor_b64 exec, exec, s[6:7]
+; GFX6-NEXT: s_cbranch_execnz .LBB8_1
+; GFX6-NEXT: ; %bb.2:
+; GFX6-NEXT: s_mov_b64 exec, s[12:13]
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b64 s[12:13], exec
+; GFX7-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_readfirstlane_b32 s8, v2
+; GFX7-NEXT: v_readfirstlane_b32 s9, v3
+; GFX7-NEXT: v_readfirstlane_b32 s10, v4
+; GFX7-NEXT: v_readfirstlane_b32 s11, v5
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3]
+; GFX7-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[4:5]
+; GFX7-NEXT: s_and_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[6:7]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[6:7], s[8:11], s4 idxen offen offset:256 glc
+; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX7-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX7-NEXT: s_xor_b64 exec, exec, s[6:7]
+; GFX7-NEXT: s_cbranch_execnz .LBB8_1
+; GFX7-NEXT: ; %bb.2:
+; GFX7-NEXT: s_mov_b64 exec, s[12:13]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %voffset.add = add i32 %voffset, 256
+ %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
+ ret double %ret
+}
+
+; Test waterfall loop on soffset
+define double @struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__vgpr_soffset(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset) {
+; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__vgpr_soffset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_mov_b64 s[6:7], exec
+; GFX6-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_readfirstlane_b32 s8, v2
+; GFX6-NEXT: v_readfirstlane_b32 s9, v3
+; GFX6-NEXT: v_readfirstlane_b32 s10, v4
+; GFX6-NEXT: v_readfirstlane_b32 s11, v5
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3]
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[4:5]
+; GFX6-NEXT: v_readfirstlane_b32 s12, v8
+; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s12, v8
+; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], vcc
+; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[6:7], s[8:11], s12 idxen offen offset:256 glc
+; GFX6-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX6-NEXT: ; implicit-def: $vgpr8
+; GFX6-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX6-NEXT: s_cbranch_execnz .LBB9_1
+; GFX6-NEXT: ; %bb.2:
+; GFX6-NEXT: s_mov_b64 exec, s[6:7]
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__vgpr_soffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b64 s[6:7], exec
+; GFX7-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_readfirstlane_b32 s8, v2
+; GFX7-NEXT: v_readfirstlane_b32 s9, v3
+; GFX7-NEXT: v_readfirstlane_b32 s10, v4
+; GFX7-NEXT: v_readfirstlane_b32 s11, v5
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3]
+; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[4:5]
+; GFX7-NEXT: v_readfirstlane_b32 s12, v8
+; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, s12, v8
+; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], vcc
+; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[6:7], s[8:11], s12 idxen offen offset:256 glc
+; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX7-NEXT: ; implicit-def: $vgpr8
+; GFX7-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB9_1
+; GFX7-NEXT: ; %bb.2:
+; GFX7-NEXT: s_mov_b64 exec, s[6:7]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %voffset.add = add i32 %voffset, 256
+ %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
+ ret double %ret
+}
+
+declare double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double, ptr addrspace(8), i32, i32, i32, i32 immarg)
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll
index 31da626..7290a91 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll
@@ -3350,17 +3350,17 @@ define float @test_atomicrmw_fmax_f32_global_agent(ptr addrspace(1) %ptr, float
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[RES]]
+; COMMON-NEXT: ret float [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst
ret float %res
@@ -3372,17 +3372,17 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memor
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[RES]]
+; COMMON-NEXT: ret float [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret float %res
@@ -3394,17 +3394,17 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(ptr
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[RES]]
+; COMMON-NEXT: ret float [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret float %res
@@ -3416,17 +3416,17 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memor
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[RES]]
+; COMMON-NEXT: ret float [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret float %res
@@ -3530,17 +3530,17 @@ define float @test_atomicrmw_fmin_f32_global_agent(ptr addrspace(1) %ptr, float
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[RES]]
+; COMMON-NEXT: ret float [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst
ret float %res
@@ -3552,17 +3552,17 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memor
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[RES]]
+; COMMON-NEXT: ret float [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret float %res
@@ -3574,17 +3574,17 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(ptr
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[RES]]
+; COMMON-NEXT: ret float [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret float %res
@@ -3596,17 +3596,17 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memor
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[RES]]
+; COMMON-NEXT: ret float [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret float %res
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll
index 35c5463..05fb224 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll
@@ -3350,17 +3350,17 @@ define float @test_atomicrmw_fmax_f32_global_system(ptr addrspace(1) %ptr, float
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[RES]]
+; COMMON-NEXT: ret float [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst
ret float %res
@@ -3372,17 +3372,17 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memo
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[RES]]
+; COMMON-NEXT: ret float [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0
ret float %res
@@ -3394,17 +3394,17 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(ptr
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[RES]]
+; COMMON-NEXT: ret float [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory !0
ret float %res
@@ -3416,17 +3416,17 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memo
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[RES]]
+; COMMON-NEXT: ret float [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret float %res
@@ -3530,17 +3530,17 @@ define float @test_atomicrmw_fmin_f32_global_system(ptr addrspace(1) %ptr, float
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[RES]]
+; COMMON-NEXT: ret float [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst
ret float %res
@@ -3552,17 +3552,17 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memo
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[RES]]
+; COMMON-NEXT: ret float [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0
ret float %res
@@ -3574,17 +3574,17 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(ptr
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[RES]]
+; COMMON-NEXT: ret float [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory !0
ret float %res
@@ -3596,17 +3596,17 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memo
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[RES]]
+; COMMON-NEXT: ret float [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret float %res
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll
index a5830bd..af6b7e0 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll
@@ -1246,17 +1246,17 @@ define double @test_atomicrmw_fmax_f64_global_agent(ptr addrspace(1) %ptr, doubl
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[RES]]
+; COMMON-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst
ret double %res
@@ -1268,17 +1268,17 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memo
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[RES]]
+; COMMON-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret double %res
@@ -1290,17 +1290,17 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(ptr
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[RES]]
+; COMMON-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret double %res
@@ -1312,17 +1312,17 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memo
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[RES]]
+; COMMON-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret double %res
@@ -1462,17 +1462,17 @@ define double @test_atomicrmw_fmin_f64_global_agent(ptr addrspace(1) %ptr, doubl
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[RES]]
+; COMMON-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst
ret double %res
@@ -1484,17 +1484,17 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memo
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[RES]]
+; COMMON-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret double %res
@@ -1506,17 +1506,17 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(ptr
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[RES]]
+; COMMON-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret double %res
@@ -1528,17 +1528,17 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memo
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[RES]]
+; COMMON-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret double %res
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll
index 4489b63..69d65e6 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll
@@ -1246,17 +1246,17 @@ define double @test_atomicrmw_fmax_f64_global_system(ptr addrspace(1) %ptr, doub
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[RES]]
+; COMMON-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst
ret double %res
@@ -1268,17 +1268,17 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_mem
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[RES]]
+; COMMON-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0
ret double %res
@@ -1290,17 +1290,17 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(pt
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[RES]]
+; COMMON-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.remote.memory !0
ret double %res
@@ -1312,17 +1312,17 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_mem
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[RES]]
+; COMMON-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret double %res
@@ -1462,17 +1462,17 @@ define double @test_atomicrmw_fmin_f64_global_system(ptr addrspace(1) %ptr, doub
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[RES]]
+; COMMON-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst
ret double %res
@@ -1484,17 +1484,17 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_mem
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[RES]]
+; COMMON-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0
ret double %res
@@ -1506,17 +1506,17 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(pt
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[RES]]
+; COMMON-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.remote.memory !0
ret double %res
@@ -1528,17 +1528,17 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_mem
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[RES]]
+; COMMON-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret double %res
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll
index 387bec7..2a5e1bd 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=atomic-expand %s | FileCheck -check-prefix=GCN %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=atomic-expand %s | FileCheck -check-prefix=GCN %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=atomic-expand %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=atomic-expand %s | FileCheck -check-prefixes=GCN,GFX9 %s
define float @test_atomicrmw_fmax_f32_flat(ptr %ptr, float %value) {
; GCN-LABEL: @test_atomicrmw_fmax_f32_flat(
@@ -257,3 +257,6 @@ define double @test_atomicrmw_fmax_f64_global_strictfp(ptr addrspace(1) %ptr, do
%res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst
ret double %res
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX7: {{.*}}
+; GFX9: {{.*}}
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll
index e7c8faa..0fa409b 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=atomic-expand %s | FileCheck -check-prefix=GCN %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=atomic-expand %s | FileCheck -check-prefix=GCN %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=atomic-expand %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=atomic-expand %s | FileCheck -check-prefixes=GCN,GFX9 %s
define float @test_atomicrmw_fmin_f32_flat(ptr %ptr, float %value) {
; GCN-LABEL: @test_atomicrmw_fmin_f32_flat(
@@ -257,3 +257,6 @@ define double @test_atomicrmw_fmin_f64_global_strictfp(ptr addrspace(1) %ptr, do
%res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst
ret double %res
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX7: {{.*}}
+; GFX9: {{.*}}