aboutsummaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/AMDGPU
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll240
-rw-r--r--llvm/test/CodeGen/AMDGPU/bad-agpr-vgpr-regalloc-priority.mir24
-rw-r--r--llvm/test/CodeGen/AMDGPU/bf16-math.ll142
-rw-r--r--llvm/test/CodeGen/AMDGPU/bf16.ll119
-rw-r--r--llvm/test/CodeGen/AMDGPU/build_vector.ll28
-rw-r--r--llvm/test/CodeGen/AMDGPU/fix-crash-valu-hazard.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll240
-rw-r--r--llvm/test/CodeGen/AMDGPU/gfx1250-scratch-scope-se.ll95
-rw-r--r--llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll88
-rw-r--r--llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll24
-rw-r--r--llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir32
-rw-r--r--llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/large-avgpr-assign-last.mir94
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll40
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll61
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll907
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll1238
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll328
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll2278
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll1238
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll174
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll878
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll183
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll270
-rw-r--r--llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll30
-rw-r--r--llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll56
-rw-r--r--llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/packed-fp32.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll51
-rw-r--r--llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll42
-rw-r--r--llvm/test/CodeGen/AMDGPU/read-write-register-illegal-type.ll29
-rw-r--r--llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll36
-rw-r--r--llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll106
-rw-r--r--llvm/test/CodeGen/AMDGPU/spill-agpr.ll38
-rw-r--r--llvm/test/CodeGen/AMDGPU/ssubo.ll676
-rw-r--r--llvm/test/CodeGen/AMDGPU/structurize-hoist.ll180
-rw-r--r--llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll120
-rw-r--r--llvm/test/CodeGen/AMDGPU/uaddo.ll553
-rw-r--r--llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll47
-rw-r--r--llvm/test/CodeGen/AMDGPU/usubo.ll499
-rw-r--r--llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll30
-rw-r--r--llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll258
-rw-r--r--llvm/test/CodeGen/AMDGPU/wwm-regalloc-error.ll2
45 files changed, 7639 insertions, 3897 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
index 8192d4a..0e132f1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -70,12 +70,12 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc:
@@ -85,12 +85,12 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -153,12 +153,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc:
@@ -168,12 +168,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -236,12 +236,12 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc:
@@ -251,12 +251,12 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -319,12 +319,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc:
@@ -334,12 +334,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -402,12 +402,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
@@ -417,12 +417,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -485,12 +485,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc:
@@ -500,12 +500,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -568,12 +568,12 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc:
@@ -583,12 +583,12 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -651,12 +651,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc:
@@ -666,12 +666,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -734,12 +734,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
@@ -749,12 +749,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -817,12 +817,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
@@ -832,12 +832,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -900,12 +900,12 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc:
@@ -915,12 +915,12 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -983,12 +983,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc:
@@ -998,12 +998,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
diff --git a/llvm/test/CodeGen/AMDGPU/bad-agpr-vgpr-regalloc-priority.mir b/llvm/test/CodeGen/AMDGPU/bad-agpr-vgpr-regalloc-priority.mir
index 1a457c9..9241a23 100644
--- a/llvm/test/CodeGen/AMDGPU/bad-agpr-vgpr-regalloc-priority.mir
+++ b/llvm/test/CodeGen/AMDGPU/bad-agpr-vgpr-regalloc-priority.mir
@@ -38,20 +38,20 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: early-clobber renamable $sgpr6_sgpr7 = S_LOAD_DWORDX2_IMM_ec renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
; CHECK-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM renamable $sgpr6_sgpr7, 0, 0 :: ("amdgpu-noclobber" load (s128), addrspace 1)
- ; CHECK-NEXT: renamable $vgpr4 = V_MOV_B32_e32 1065353216, implicit $exec
- ; CHECK-NEXT: renamable $vgpr5 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: renamable $vgpr6 = V_MOV_B32_e32 1073741824, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 1065353216, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr4 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr1 = V_MOV_B32_e32 1073741824, implicit $exec
; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
- ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr4, $vgpr6, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: renamable $vgpr1 = COPY renamable $agpr1
- ; CHECK-NEXT: renamable $vgpr0 = COPY renamable $agpr0
- ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr4, $vgpr6, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: renamable $vgpr3 = COPY renamable $agpr1
- ; CHECK-NEXT: renamable $vgpr2 = COPY killed renamable $agpr0
- ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3
- ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr4, killed $vgpr6, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr0, $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr6 = COPY renamable $agpr1
+ ; CHECK-NEXT: renamable $vgpr5 = COPY renamable $agpr0
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr0, $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr8 = COPY renamable $agpr1
+ ; CHECK-NEXT: renamable $vgpr7 = COPY killed renamable $agpr0
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr5_vgpr6_vgpr7_vgpr8
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3
- ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr5, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr6_sgpr7, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr4, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr6_sgpr7, 0, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
early-clobber renamable $sgpr6_sgpr7 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM renamable $sgpr6_sgpr7, 0, 0 :: ("amdgpu-noclobber" load (s128), addrspace 1)
diff --git a/llvm/test/CodeGen/AMDGPU/bf16-math.ll b/llvm/test/CodeGen/AMDGPU/bf16-math.ll
index 1adf542..9979e83 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16-math.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16-math.ll
@@ -323,6 +323,146 @@ define amdgpu_ps void @v_test_max_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat>
ret void
}
+define amdgpu_ps bfloat @test_clamp_bf16(bfloat %src) {
+; GCN-LABEL: test_clamp_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
+; GCN-NEXT: ; return to shader part epilog
+ %max = call bfloat @llvm.maxnum.bf16(bfloat %src, bfloat 0.0)
+ %clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
+ ret bfloat %clamp
+}
+
+define amdgpu_ps bfloat @test_clamp_bf16_s(bfloat inreg %src) {
+; GCN-LABEL: test_clamp_bf16_s:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_max_num_bf16 v0, s0, s0 clamp
+; GCN-NEXT: ; return to shader part epilog
+ %max = call bfloat @llvm.maxnum.bf16(bfloat %src, bfloat 0.0)
+ %clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
+ ret bfloat %clamp
+}
+
+define amdgpu_ps float @test_clamp_v2bf16(<2 x bfloat> %src) {
+; GCN-LABEL: test_clamp_v2bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
+; GCN-NEXT: ; return to shader part epilog
+ %max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %src, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)
+ %clamp = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %max, <2 x bfloat> <bfloat 1.0, bfloat 1.0>)
+ %ret = bitcast <2 x bfloat> %clamp to float
+ ret float %ret
+}
+
+define amdgpu_ps float @test_clamp_v2bf16_s(<2 x bfloat> inreg %src) {
+; GCN-LABEL: test_clamp_v2bf16_s:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_max_num_bf16 v0, s0, s0 clamp
+; GCN-NEXT: ; return to shader part epilog
+ %max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %src, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)
+ %clamp = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %max, <2 x bfloat> <bfloat 1.0, bfloat 1.0>)
+ %ret = bitcast <2 x bfloat> %clamp to float
+ ret float %ret
+}
+
+define amdgpu_ps bfloat @test_clamp_bf16_folding(bfloat %src) {
+; GCN-LABEL: test_clamp_bf16_folding:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_exp_bf16_e32 v0, v0
+; GCN-NEXT: v_nop
+; GCN-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
+; GCN-NEXT: ; return to shader part epilog
+ %exp = call bfloat @llvm.exp2.bf16(bfloat %src)
+ %max = call bfloat @llvm.maxnum.bf16(bfloat %exp, bfloat 0.0)
+ %clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
+ ret bfloat %clamp
+}
+
+define amdgpu_ps float @test_clamp_v2bf16_folding(<2 x bfloat> %src0, <2 x bfloat> %src1) {
+; GCN-LABEL: test_clamp_v2bf16_folding:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_mul_bf16 v0, v0, v1
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
+; GCN-NEXT: ; return to shader part epilog
+ %mul = fmul <2 x bfloat> %src0, %src1
+ %max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %mul, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)
+ %clamp = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %max, <2 x bfloat> <bfloat 1.0, bfloat 1.0>)
+ %ret = bitcast <2 x bfloat> %clamp to float
+ ret float %ret
+}
+
+define amdgpu_ps void @v_test_mul_add_v2bf16_vvv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
+; GCN-LABEL: v_test_mul_add_v2bf16_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_mul_bf16 v2, v2, v3
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_pk_add_bf16 v2, v2, v4
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %mul = fmul contract <2 x bfloat> %a, %b
+ %add = fadd contract <2 x bfloat> %mul, %c
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_mul_add_v2bf16_vss(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b, <2 x bfloat> inreg %c) {
+; GCN-LABEL: v_test_mul_add_v2bf16_vss:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_mul_bf16 v2, v2, s0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_pk_add_bf16 v2, v2, s1
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %mul = fmul contract <2 x bfloat> %a, %b
+ %add = fadd contract <2 x bfloat> %mul, %c
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_mul_add_v2bf16_sss(ptr addrspace(1) %out, <2 x bfloat> inreg %a, <2 x bfloat> inreg %b, <2 x bfloat> inreg %c) {
+; GCN-LABEL: v_test_mul_add_v2bf16_sss:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_mul_bf16 v2, s0, s1
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_pk_add_bf16 v2, v2, s2
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %mul = fmul contract <2 x bfloat> %a, %b
+ %add = fadd contract <2 x bfloat> %mul, %c
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_mul_add_v2bf16_vsc(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b) {
+; GCN-LABEL: v_test_mul_add_v2bf16_vsc:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_mul_bf16 v2, v2, s0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_pk_add_bf16 v2, v2, 0.5 op_sel_hi:[1,0]
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %mul = fmul contract <2 x bfloat> %a, %b
+ %add = fadd contract <2 x bfloat> %mul, <bfloat 0.5, bfloat 0.5>
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_mul_add_v2bf16_vll(ptr addrspace(1) %out, <2 x bfloat> %a) {
+; GCN-LABEL: v_test_mul_add_v2bf16_vll:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_mul_bf16 v2, 0x42c83f80, v2
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_pk_add_bf16 v2, 0x43484000, v2
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %mul = fmul contract <2 x bfloat> %a, <bfloat 1.0, bfloat 100.0>
+ %add = fadd contract <2 x bfloat> %mul, <bfloat 2.0, bfloat 200.0>
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
define amdgpu_ps void @v_test_fma_v2bf16_vvv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
; GCN-LABEL: v_test_fma_v2bf16_vvv:
; GCN: ; %bb.0:
@@ -426,6 +566,8 @@ define amdgpu_ps void @llvm_exp2_bf16_s(ptr addrspace(1) %out, bfloat inreg %src
ret void
}
+declare bfloat @llvm.minnum.bf16(bfloat, bfloat)
+declare bfloat @llvm.maxnum.bf16(bfloat, bfloat)
declare <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
declare <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
declare <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>)
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 7859fcdf..52e697c 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -468,15 +468,28 @@ define <16 x bfloat> @v_load_global_v16bf16(ptr addrspace(1) %ptr) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_load_global_v16bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v9, v1
-; GFX9-NEXT: v_mov_b32_e32 v8, v0
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[8:9], off
-; GFX9-NEXT: global_load_dwordx4 v[4:7], v[8:9], off offset:16
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_load_global_v16bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v9, v1
+; GFX900-NEXT: v_mov_b32_e32 v8, v0
+; GFX900-NEXT: global_load_dwordx4 v[0:3], v[8:9], off
+; GFX900-NEXT: global_load_dwordx4 v[4:7], v[8:9], off offset:16
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_load_global_v16bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: global_load_dwordx4 v[8:11], v[0:1], off
+; GFX950-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16
+; GFX950-NEXT: s_waitcnt vmcnt(1)
+; GFX950-NEXT: v_mov_b32_e32 v0, v8
+; GFX950-NEXT: v_mov_b32_e32 v1, v9
+; GFX950-NEXT: v_mov_b32_e32 v2, v10
+; GFX950-NEXT: v_mov_b32_e32 v3, v11
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_load_global_v16bf16:
; GFX10: ; %bb.0:
@@ -619,17 +632,32 @@ define <32 x bfloat> @v_load_global_v32bf16(ptr addrspace(1) %ptr) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_load_global_v32bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v17, v1
-; GFX9-NEXT: v_mov_b32_e32 v16, v0
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[16:17], off
-; GFX9-NEXT: global_load_dwordx4 v[4:7], v[16:17], off offset:16
-; GFX9-NEXT: global_load_dwordx4 v[8:11], v[16:17], off offset:32
-; GFX9-NEXT: global_load_dwordx4 v[12:15], v[16:17], off offset:48
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_load_global_v32bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v17, v1
+; GFX900-NEXT: v_mov_b32_e32 v16, v0
+; GFX900-NEXT: global_load_dwordx4 v[0:3], v[16:17], off
+; GFX900-NEXT: global_load_dwordx4 v[4:7], v[16:17], off offset:16
+; GFX900-NEXT: global_load_dwordx4 v[8:11], v[16:17], off offset:32
+; GFX900-NEXT: global_load_dwordx4 v[12:15], v[16:17], off offset:48
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_load_global_v32bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: global_load_dwordx4 v[16:19], v[0:1], off
+; GFX950-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16
+; GFX950-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32
+; GFX950-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48
+; GFX950-NEXT: s_waitcnt vmcnt(3)
+; GFX950-NEXT: v_mov_b32_e32 v0, v16
+; GFX950-NEXT: v_mov_b32_e32 v1, v17
+; GFX950-NEXT: v_mov_b32_e32 v2, v18
+; GFX950-NEXT: v_mov_b32_e32 v3, v19
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_load_global_v32bf16:
; GFX10: ; %bb.0:
@@ -877,22 +905,41 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_load_global_v64bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v29, v1
-; GFX9-NEXT: v_mov_b32_e32 v28, v0
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[28:29], off
-; GFX9-NEXT: global_load_dwordx4 v[4:7], v[28:29], off offset:16
-; GFX9-NEXT: global_load_dwordx4 v[8:11], v[28:29], off offset:32
-; GFX9-NEXT: global_load_dwordx4 v[12:15], v[28:29], off offset:48
-; GFX9-NEXT: global_load_dwordx4 v[16:19], v[28:29], off offset:64
-; GFX9-NEXT: global_load_dwordx4 v[20:23], v[28:29], off offset:80
-; GFX9-NEXT: global_load_dwordx4 v[24:27], v[28:29], off offset:96
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: global_load_dwordx4 v[28:31], v[28:29], off offset:112
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_load_global_v64bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v29, v1
+; GFX900-NEXT: v_mov_b32_e32 v28, v0
+; GFX900-NEXT: global_load_dwordx4 v[0:3], v[28:29], off
+; GFX900-NEXT: global_load_dwordx4 v[4:7], v[28:29], off offset:16
+; GFX900-NEXT: global_load_dwordx4 v[8:11], v[28:29], off offset:32
+; GFX900-NEXT: global_load_dwordx4 v[12:15], v[28:29], off offset:48
+; GFX900-NEXT: global_load_dwordx4 v[16:19], v[28:29], off offset:64
+; GFX900-NEXT: global_load_dwordx4 v[20:23], v[28:29], off offset:80
+; GFX900-NEXT: global_load_dwordx4 v[24:27], v[28:29], off offset:96
+; GFX900-NEXT: s_nop 0
+; GFX900-NEXT: global_load_dwordx4 v[28:31], v[28:29], off offset:112
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_load_global_v64bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: global_load_dwordx4 v[32:35], v[0:1], off
+; GFX950-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16
+; GFX950-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32
+; GFX950-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48
+; GFX950-NEXT: global_load_dwordx4 v[16:19], v[0:1], off offset:64
+; GFX950-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:80
+; GFX950-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:96
+; GFX950-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:112
+; GFX950-NEXT: s_waitcnt vmcnt(7)
+; GFX950-NEXT: v_mov_b32_e32 v0, v32
+; GFX950-NEXT: v_mov_b32_e32 v1, v33
+; GFX950-NEXT: v_mov_b32_e32 v2, v34
+; GFX950-NEXT: v_mov_b32_e32 v3, v35
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_load_global_v64bf16:
; GFX10: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll
index 7208eae..763f436 100644
--- a/llvm/test/CodeGen/AMDGPU/build_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll
@@ -51,11 +51,11 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) {
; GFX942-LABEL: build_vector2:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, 5
-; GFX942-NEXT: v_mov_b32_e32 v1, 6
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 5
+; GFX942-NEXT: v_mov_b32_e32 v3, 6
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
; GFX942-NEXT: s_endpgm
entry:
store <2 x i32> <i32 5, i32 6>, ptr addrspace(1) %out
@@ -116,13 +116,13 @@ define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) {
; GFX942-LABEL: build_vector4:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, 5
-; GFX942-NEXT: v_mov_b32_e32 v1, 6
-; GFX942-NEXT: v_mov_b32_e32 v2, 7
-; GFX942-NEXT: v_mov_b32_e32 v3, 8
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 5
+; GFX942-NEXT: v_mov_b32_e32 v3, 6
+; GFX942-NEXT: v_mov_b32_e32 v4, 7
+; GFX942-NEXT: v_mov_b32_e32 v5, 8
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
; GFX942-NEXT: s_endpgm
entry:
store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, ptr addrspace(1) %out
@@ -307,13 +307,13 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out,
; GFX942-LABEL: build_v2i32_from_v4i16_shuffle:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_lshl_b32 s3, s3, 16
; GFX942-NEXT: s_lshl_b32 s2, s2, 16
-; GFX942-NEXT: v_mov_b32_e32 v0, s2
-; GFX942-NEXT: v_mov_b32_e32 v1, s3
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, s2
+; GFX942-NEXT: v_mov_b32_e32 v3, s3
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
; GFX942-NEXT: s_endpgm
entry:
%shuf = shufflevector <4 x i16> %in, <4 x i16> zeroinitializer, <2 x i32> <i32 0, i32 2>
diff --git a/llvm/test/CodeGen/AMDGPU/fix-crash-valu-hazard.ll b/llvm/test/CodeGen/AMDGPU/fix-crash-valu-hazard.ll
index 8781196..4f752d1 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-crash-valu-hazard.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-crash-valu-hazard.ll
@@ -8,10 +8,10 @@ define amdgpu_ps void @global_load_lds_dword_saddr(ptr addrspace(1) inreg nocapt
; GFX942-LABEL: global_load_lds_dword_saddr:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: v_readfirstlane_b32 s2, v0
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
; GFX942-NEXT: s_mov_b32 m0, s2
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: global_load_lds_dword v2, s[0:1] offset:32 nt
+; GFX942-NEXT: global_load_lds_dword v1, s[0:1] offset:32 nt
; GFX942-NEXT: s_getpc_b64 s[0:1]
; GFX942-NEXT: s_add_u32 s0, s0, G@gotpcrel32@lo+4
; GFX942-NEXT: s_addc_u32 s1, s1, G@gotpcrel32@hi+12
@@ -21,9 +21,9 @@ define amdgpu_ps void @global_load_lds_dword_saddr(ptr addrspace(1) inreg nocapt
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_mul_i32 s3, s3, 10
; GFX942-NEXT: s_mul_i32 s2, s2, 10
-; GFX942-NEXT: v_mov_b32_e32 v0, s2
-; GFX942-NEXT: v_mov_b32_e32 v1, s3
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, s2
+; GFX942-NEXT: v_mov_b32_e32 v3, s3
+; GFX942-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX90A-LABEL: global_load_lds_dword_saddr:
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
index 873fcee..6067194 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
@@ -71,12 +71,12 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc:
@@ -86,12 +86,12 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -154,12 +154,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc:
@@ -169,12 +169,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -237,12 +237,12 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc:
@@ -252,12 +252,12 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -320,12 +320,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc:
@@ -335,12 +335,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -403,12 +403,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
@@ -418,12 +418,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -486,12 +486,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc:
@@ -501,12 +501,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -569,12 +569,12 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc:
@@ -584,12 +584,12 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -652,12 +652,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc:
@@ -667,12 +667,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -735,12 +735,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
@@ -750,12 +750,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -818,12 +818,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
@@ -833,12 +833,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -901,12 +901,12 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc:
@@ -916,12 +916,12 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -984,12 +984,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc:
@@ -999,12 +999,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
diff --git a/llvm/test/CodeGen/AMDGPU/gfx1250-scratch-scope-se.ll b/llvm/test/CodeGen/AMDGPU/gfx1250-scratch-scope-se.ll
new file mode 100644
index 0000000..d1e82a0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/gfx1250-scratch-scope-se.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-GISEL %s
+
+; Test that stores that may hit scratch are correctly promoted to SCOPE_SE.
+
+define void @test_scratch_store(ptr addrspace(5) %ptr, i32 %val) {
+; GCN-LABEL: test_scratch_store:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: scratch_store_b32 v0, v1, off scope:SCOPE_SE
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ store i32 %val, ptr addrspace(5) %ptr
+ ret void
+}
+
+define void @test_unknown_flat_store(ptr %ptr, i32 %val) {
+; GCN-LABEL: test_unknown_flat_store:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GCN-NEXT: s_wait_dscnt 0x0
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ store i32 %val, ptr %ptr
+ ret void
+}
+
+define void @test_flat_store_no_scratch_alloc(ptr %ptr, i32 %val) #0 {
+; GCN-LABEL: test_flat_store_no_scratch_alloc:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: flat_store_b32 v[0:1], v2
+; GCN-NEXT: s_wait_dscnt 0x0
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ store i32 %val, ptr %ptr
+ ret void
+}
+
+; TODO: handle
+define void @test_flat_store_noalias_addrspace(ptr %ptr, i32 %val) {
+; GCN-LABEL: test_flat_store_noalias_addrspace:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GCN-NEXT: s_wait_dscnt 0x0
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ store i32 %val, ptr %ptr, !noalias.addrspace !{i32 5, i32 6}
+ ret void
+}
+
+; TODO: would be nice to handle too
+define void @test_flat_store_select(ptr addrspace(1) %a, ptr addrspace(3) %b, i1 %cond, i32 %val) {
+; GCN-SDAG-LABEL: test_flat_store_select:
+; GCN-SDAG: ; %bb.0:
+; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-SDAG-NEXT: s_wait_kmcnt 0x0
+; GCN-SDAG-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v2
+; GCN-SDAG-NEXT: v_and_b32_e32 v3, 1, v3
+; GCN-SDAG-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc_lo
+; GCN-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, s1, vcc_lo
+; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GCN-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GCN-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v2, v0
+; GCN-SDAG-NEXT: flat_store_b32 v[0:1], v4 scope:SCOPE_SE
+; GCN-SDAG-NEXT: s_wait_dscnt 0x0
+; GCN-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GCN-GISEL-LABEL: test_flat_store_select:
+; GCN-GISEL: ; %bb.0:
+; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-GISEL-NEXT: s_wait_kmcnt 0x0
+; GCN-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v2
+; GCN-GISEL-NEXT: v_and_b32_e32 v3, 1, v3
+; GCN-GISEL-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GCN-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc_lo
+; GCN-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, s1, vcc_lo
+; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GCN-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
+; GCN-GISEL-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v5, v1
+; GCN-GISEL-NEXT: flat_store_b32 v[0:1], v4 scope:SCOPE_SE
+; GCN-GISEL-NEXT: s_wait_dscnt 0x0
+; GCN-GISEL-NEXT: s_set_pc_i64 s[30:31]
+ %a.ascast = addrspacecast ptr addrspace(1) %a to ptr
+ %b.ascast = addrspacecast ptr addrspace(3) %b to ptr
+ %ptr = select i1 %cond, ptr %a.ascast, ptr %b.ascast
+ store i32 %val, ptr %ptr
+ ret void
+}
+
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
index fd644a3..3a898a9 100644
--- a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
@@ -124,27 +124,27 @@ define i32 @test_v64i32_load_store(ptr addrspace(1) %ptr, i32 %idx, ptr addrspac
; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-SDAG-NEXT: s_wait_kmcnt 0x0
; GCN-SDAG-NEXT: s_clause 0xd
-; GCN-SDAG-NEXT: scratch_store_b32 off, v40, s32 offset:52
-; GCN-SDAG-NEXT: scratch_store_b32 off, v41, s32 offset:48
-; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:44
-; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32 offset:40
-; GCN-SDAG-NEXT: scratch_store_b32 off, v44, s32 offset:36
-; GCN-SDAG-NEXT: scratch_store_b32 off, v45, s32 offset:32
-; GCN-SDAG-NEXT: scratch_store_b32 off, v56, s32 offset:28
-; GCN-SDAG-NEXT: scratch_store_b32 off, v57, s32 offset:24
-; GCN-SDAG-NEXT: scratch_store_b32 off, v58, s32 offset:20
-; GCN-SDAG-NEXT: scratch_store_b32 off, v59, s32 offset:16
-; GCN-SDAG-NEXT: scratch_store_b32 off, v60, s32 offset:12
-; GCN-SDAG-NEXT: scratch_store_b32 off, v61, s32 offset:8
-; GCN-SDAG-NEXT: scratch_store_b32 off, v62, s32 offset:4
-; GCN-SDAG-NEXT: scratch_store_b32 off, v63, s32
+; GCN-SDAG-NEXT: scratch_store_b32 off, v40, s32 offset:52 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v41, s32 offset:48 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:44 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32 offset:40 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v44, s32 offset:36 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v45, s32 offset:32 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v56, s32 offset:28 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v57, s32 offset:24 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v58, s32 offset:20 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v59, s32 offset:16 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v60, s32 offset:12 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v61, s32 offset:8 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v62, s32 offset:4 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v63, s32 scope:SCOPE_SE
; GCN-SDAG-NEXT: global_load_b128 v[6:9], v[0:1], off offset:224
; GCN-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
-; GCN-SDAG-NEXT: scratch_store_b128 off, v[6:9], s32 offset:56 ; 16-byte Folded Spill
+; GCN-SDAG-NEXT: scratch_store_b128 off, v[6:9], s32 offset:56 scope:SCOPE_SE ; 16-byte Folded Spill
; GCN-SDAG-NEXT: global_load_b128 v[6:9], v[0:1], off offset:240
; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
-; GCN-SDAG-NEXT: scratch_store_b128 off, v[6:9], s32 offset:72 ; 16-byte Folded Spill
+; GCN-SDAG-NEXT: scratch_store_b128 off, v[6:9], s32 offset:72 scope:SCOPE_SE ; 16-byte Folded Spill
; GCN-SDAG-NEXT: s_clause 0xd
; GCN-SDAG-NEXT: global_load_b128 v[10:13], v[0:1], off offset:192
; GCN-SDAG-NEXT: global_load_b128 v[14:17], v[0:1], off offset:208
@@ -206,27 +206,27 @@ define i32 @test_v64i32_load_store(ptr addrspace(1) %ptr, i32 %idx, ptr addrspac
; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-GISEL-NEXT: s_wait_kmcnt 0x0
; GCN-GISEL-NEXT: s_clause 0xf
-; GCN-GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:60
-; GCN-GISEL-NEXT: scratch_store_b32 off, v41, s32 offset:56
-; GCN-GISEL-NEXT: scratch_store_b32 off, v42, s32 offset:52
-; GCN-GISEL-NEXT: scratch_store_b32 off, v43, s32 offset:48
-; GCN-GISEL-NEXT: scratch_store_b32 off, v44, s32 offset:44
-; GCN-GISEL-NEXT: scratch_store_b32 off, v45, s32 offset:40
-; GCN-GISEL-NEXT: scratch_store_b32 off, v46, s32 offset:36
-; GCN-GISEL-NEXT: scratch_store_b32 off, v47, s32 offset:32
-; GCN-GISEL-NEXT: scratch_store_b32 off, v56, s32 offset:28
-; GCN-GISEL-NEXT: scratch_store_b32 off, v57, s32 offset:24
-; GCN-GISEL-NEXT: scratch_store_b32 off, v58, s32 offset:20
-; GCN-GISEL-NEXT: scratch_store_b32 off, v59, s32 offset:16
-; GCN-GISEL-NEXT: scratch_store_b32 off, v60, s32 offset:12
-; GCN-GISEL-NEXT: scratch_store_b32 off, v61, s32 offset:8
-; GCN-GISEL-NEXT: scratch_store_b32 off, v62, s32 offset:4
-; GCN-GISEL-NEXT: scratch_store_b32 off, v63, s32
+; GCN-GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:60 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v41, s32 offset:56 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v42, s32 offset:52 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v43, s32 offset:48 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v44, s32 offset:44 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v45, s32 offset:40 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v46, s32 offset:36 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v47, s32 offset:32 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v56, s32 offset:28 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v57, s32 offset:24 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v58, s32 offset:20 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v59, s32 offset:16 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v60, s32 offset:12 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v61, s32 offset:8 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v62, s32 offset:4 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v63, s32 scope:SCOPE_SE
; GCN-GISEL-NEXT: s_wait_xcnt 0x8
; GCN-GISEL-NEXT: v_dual_mov_b32 v46, v3 :: v_dual_mov_b32 v47, v4
; GCN-GISEL-NEXT: global_load_b128 v[2:5], v[0:1], off offset:32
; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
-; GCN-GISEL-NEXT: scratch_store_b128 off, v[2:5], s32 offset:80 ; 16-byte Folded Spill
+; GCN-GISEL-NEXT: scratch_store_b128 off, v[2:5], s32 offset:80 scope:SCOPE_SE ; 16-byte Folded Spill
; GCN-GISEL-NEXT: s_clause 0xe
; GCN-GISEL-NEXT: global_load_b128 v[6:9], v[0:1], off offset:48
; GCN-GISEL-NEXT: global_load_b128 v[10:13], v[0:1], off offset:64
@@ -244,7 +244,7 @@ define i32 @test_v64i32_load_store(ptr addrspace(1) %ptr, i32 %idx, ptr addrspac
; GCN-GISEL-NEXT: global_load_b128 v[60:63], v[0:1], off offset:16
; GCN-GISEL-NEXT: global_load_b128 v[0:3], v[0:1], off offset:240
; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
-; GCN-GISEL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:64 ; 16-byte Folded Spill
+; GCN-GISEL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:64 scope:SCOPE_SE ; 16-byte Folded Spill
; GCN-GISEL-NEXT: scratch_load_b128 v[0:3], off, s32 offset:80 th:TH_LOAD_LU ; 16-byte Folded Reload
; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
; GCN-GISEL-NEXT: s_clause 0xe
@@ -299,10 +299,10 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-SDAG-NEXT: s_wait_kmcnt 0x0
; GCN-SDAG-NEXT: s_clause 0x3
-; GCN-SDAG-NEXT: scratch_store_b32 off, v40, s32 offset:12
-; GCN-SDAG-NEXT: scratch_store_b32 off, v41, s32 offset:8
-; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:4
-; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32
+; GCN-SDAG-NEXT: scratch_store_b32 off, v40, s32 offset:12 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v41, s32 offset:8 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:4 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32 scope:SCOPE_SE
; GCN-SDAG-NEXT: s_clause 0x7
; GCN-SDAG-NEXT: global_load_b128 v[10:13], v[0:1], off offset:112
; GCN-SDAG-NEXT: global_load_b128 v[18:21], v[0:1], off offset:96
@@ -385,12 +385,12 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-GISEL-NEXT: s_wait_kmcnt 0x0
; GCN-GISEL-NEXT: s_clause 0x5
-; GCN-GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:20
-; GCN-GISEL-NEXT: scratch_store_b32 off, v41, s32 offset:16
-; GCN-GISEL-NEXT: scratch_store_b32 off, v42, s32 offset:12
-; GCN-GISEL-NEXT: scratch_store_b32 off, v43, s32 offset:8
-; GCN-GISEL-NEXT: scratch_store_b32 off, v44, s32 offset:4
-; GCN-GISEL-NEXT: scratch_store_b32 off, v45, s32
+; GCN-GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:20 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v41, s32 offset:16 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v42, s32 offset:12 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v43, s32 offset:8 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v44, s32 offset:4 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v45, s32 scope:SCOPE_SE
; GCN-GISEL-NEXT: s_clause 0x7
; GCN-GISEL-NEXT: global_load_b128 v[6:9], v[0:1], off offset:80
; GCN-GISEL-NEXT: global_load_b128 v[10:13], v[0:1], off
diff --git a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll
index 258aa9e..0a493e51 100644
--- a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll
+++ b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll
@@ -8,15 +8,15 @@ define protected amdgpu_kernel void @InferNothing(i32 %a, ptr %b, double %c) {
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_ashr_i32 s7, s6, 31
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-NEXT: v_mov_b32_e32 v3, s3
; CHECK-NEXT: s_lshl_b64 s[2:3], s[6:7], 3
; CHECK-NEXT: s_add_u32 s0, s2, s0
; CHECK-NEXT: s_addc_u32 s1, s3, s1
-; CHECK-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-NEXT: v_add_co_u32_e64 v2, vcc, -8, s0
-; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
-; CHECK-NEXT: flat_atomic_add_f64 v[2:3], v[0:1]
+; CHECK-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-NEXT: v_add_co_u32_e64 v0, vcc, -8, s0
+; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: buffer_wbinvl1_vol
; CHECK-NEXT: s_endpgm
@@ -35,15 +35,15 @@ define protected amdgpu_kernel void @InferFadd(i32 %a, ptr addrspace(1) %b, doub
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_ashr_i32 s7, s6, 31
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-NEXT: v_mov_b32_e32 v3, s3
; CHECK-NEXT: s_lshl_b64 s[2:3], s[6:7], 3
; CHECK-NEXT: s_add_u32 s0, s0, s2
; CHECK-NEXT: s_addc_u32 s1, s1, s3
-; CHECK-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-NEXT: v_add_co_u32_e64 v2, vcc, -8, s0
-; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
-; CHECK-NEXT: flat_atomic_add_f64 v[2:3], v[0:1]
+; CHECK-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-NEXT: v_add_co_u32_e64 v0, vcc, -8, s0
+; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: buffer_wbinvl1_vol
; CHECK-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir b/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir
index 11de6c8..06c3da0 100644
--- a/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir
+++ b/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir
@@ -32,32 +32,14 @@
# CHECK-NEXT: undef [[SPLIT0:%[0-9]+]].sub2_sub3:av_512_align2 = COPY undef $vgpr2_vgpr3 {
# CHECK-NEXT: internal [[SPLIT0]].sub0:av_512_align2 = COPY undef $vgpr0
# CHECK-NEXT: }
-# CHECK-NEXT: undef [[SPLIT1:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT0]].sub2_sub3 {
-# CHECK-NEXT: internal [[SPLIT1]].sub0:av_512_align2 = COPY [[SPLIT0]].sub0
-# CHECK-NEXT: }
-# CHECK-NEXT: undef [[SPLIT2:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT1]].sub2_sub3 {
-# CHECK-NEXT: internal [[SPLIT2]].sub0:av_512_align2 = COPY [[SPLIT1]].sub0
-# CHECK-NEXT: }
-# CHECK-NEXT: SI_SPILL_AV512_SAVE [[SPLIT2]], %stack.1, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.1, align 4, addrspace 5)
-# CHECK-NEXT: [[RESTORE1:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5)
-# CHECK-NEXT: undef [[SPLIT3:%[0-9]+]].sub0_sub1:av_512_align2 = COPY [[RESTORE1]].sub0_sub1
-# CHECK-NEXT: [[RESTORE2:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.1, align 4, addrspace 5)
-# CHECK-NEXT: undef [[SPLIT3:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[RESTORE2]].sub2_sub3 {
-# CHECK-NEXT: internal [[SPLIT3]].sub0:av_512_align2 = COPY [[RESTORE2]].sub0
-# CHECK-NEXT: }
-# CHECK-NEXT: undef [[SPLIT4:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT3]].sub2_sub3 {
-# CHECK-NEXT: internal [[SPLIT4]].sub0:av_512_align2 = COPY [[SPLIT3]].sub0
-# CHECK-NEXT: }
-# CHECK-NEXT: [[SPLIT5:%[0-9]+]].sub2:av_512_align2 = COPY [[SPLIT4]].sub3
-# CHECK-NEXT: undef [[SPLIT6:%[0-9]+]].sub0_sub1_sub2:av_512_align2 = COPY [[SPLIT5]].sub0_sub1_sub2
-# CHECK-NEXT: undef [[SPLIT7:%[0-9]+]].sub0_sub1_sub2:av_512_align2 = COPY [[SPLIT6]].sub0_sub1_sub2
-# CHECK-NEXT: undef [[SPLIT8:%[0-9]+]].sub0:av_512_align2 = COPY [[SPLIT4]].sub0 {
-# CHECK-NEXT: internal [[SPLIT8]].sub2:av_512_align2 = COPY [[SPLIT4]].sub2
+# CHECK-NEXT: undef [[SPLIT2:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT0]].sub2_sub3 {
+# CHECK-NEXT: internal [[SPLIT2]].sub0:av_512_align2 = COPY [[SPLIT0]].sub0
# CHECK-NEXT: }
-# CHECK-NEXT: [[SPLIT9:%[0-9]+]].sub3:av_512_align2 = COPY [[SPLIT8]].sub2
-# CHECK-NEXT: undef [[SPLIT10:%[0-9]+]].sub0_sub1_sub2_sub3:av_512_align2 = COPY [[SPLIT9]].sub0_sub1_sub2_sub3
-# CHECK-NEXT: undef [[SPLIT13:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_512_align2 = COPY [[SPLIT10]].sub0_sub1_sub2_sub3
-# CHECK-NEXT: [[MFMA_USE1:%[0-9]+]].sub4:vreg_512_align2 = COPY [[SPLIT8]].sub0
+# CHECK-NEXT: [[RESTORE2:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5)
+# CHECK-NEXT: [[MFMA_USE1:%[0-9]+]].sub0_sub1:vreg_512_align2 = COPY [[RESTORE2]].sub0_sub1
+# CHECK-NEXT: [[MFMA_USE1]].sub2:vreg_512_align2 = COPY [[SPLIT2]].sub3
+# CHECK-NEXT: [[MFMA_USE1]].sub3:vreg_512_align2 = COPY [[SPLIT2]].sub2
+# CHECK-NEXT: [[MFMA_USE1]].sub4:vreg_512_align2 = COPY [[SPLIT2]].sub0
# CHECK-NEXT: [[MFMA_USE1]].sub5:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
# CHECK-NEXT: [[MFMA_USE1]].sub6:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
# CHECK-NEXT: [[MFMA_USE1]].sub7:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
index 1ac75d3..d8c983a 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
@@ -1331,16 +1331,16 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX942-NEXT: s_load_dword s6, s[4:5], 0x10
; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v8, 5, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, 0x5040100
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 5, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0x5040100
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3]
-; GFX942-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16
+; GFX942-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
+; GFX942-NEXT: global_load_dwordx4 v[6:9], v4, s[2:3] offset:16
; GFX942-NEXT: s_waitcnt vmcnt(1)
-; GFX942-NEXT: v_perm_b32 v1, s6, v1, v9
+; GFX942-NEXT: v_perm_b32 v1, s6, v1, v5
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
diff --git a/llvm/test/CodeGen/AMDGPU/large-avgpr-assign-last.mir b/llvm/test/CodeGen/AMDGPU/large-avgpr-assign-last.mir
new file mode 100644
index 0000000..58e9b0a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/large-avgpr-assign-last.mir
@@ -0,0 +1,94 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -verify-regalloc -greedy-regclass-priority-trumps-globalness=1 -start-after=machine-scheduler -stop-after=virtregrewriter,2 -o - %s | FileCheck %s
+
+--- |
+ define void @temp_vgpr_to_agpr_should_not_undo_split_with_remat() #0 {
+ entry:
+ unreachable
+ }
+
+ attributes #0 = { "amdgpu-agpr-alloc"="0,0" }
+...
+
+
+---
+name: temp_vgpr_to_agpr_should_not_undo_split_with_remat
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
+ stackPtrOffsetReg: '$sgpr32'
+ argumentInfo:
+ privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+ kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+ workGroupIDX: { reg: '$sgpr6' }
+ privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+ workItemIDX: { reg: '$vgpr0' }
+ sgprForEXECCopy: '$sgpr100_sgpr101'
+body: |
+ bb.0:
+ liveins: $vgpr0, $sgpr4_sgpr5
+ ; CHECK-LABEL: name: temp_vgpr_to_agpr_should_not_undo_split_with_remat
+ ; CHECK: liveins: $vgpr0, $sgpr4_sgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF
+ ; CHECK-NEXT: dead renamable $vgpr1 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr1 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr2 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr3 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr4 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr5 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr6 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr7 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr8 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr9 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr10 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr11 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr12 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr13 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr14 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr15 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr16 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr17 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr18 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr19 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr20 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr21 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr22 = IMPLICIT_DEF
+ ; CHECK-NEXT: KILL killed renamable $vgpr2, killed renamable $vgpr3, killed renamable $vgpr4, killed renamable $vgpr5, killed renamable $vgpr6, killed renamable $vgpr7, killed renamable $vgpr8, killed renamable $vgpr9, killed renamable $vgpr10, killed renamable $vgpr11, killed renamable $vgpr12, killed renamable $vgpr13, killed renamable $vgpr14, killed renamable $vgpr15, killed renamable $vgpr16
+ ; CHECK-NEXT: S_NOP 0, implicit-def renamable $vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38
+ ; CHECK-NEXT: S_NOP 0, implicit-def renamable $vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54
+ ; CHECK-NEXT: KILL killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr17, killed renamable $vgpr18, killed renamable $vgpr19, killed renamable $vgpr20, killed renamable $vgpr21, killed renamable $vgpr22
+ ; CHECK-NEXT: S_NOP 0, implicit killed renamable $vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38, implicit killed renamable $vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54
+ ; CHECK-NEXT: S_ENDPGM 0
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:vgpr_32 = IMPLICIT_DEF
+ %2:vgpr_32 = IMPLICIT_DEF
+ %3:vgpr_32 = IMPLICIT_DEF
+ %4:vgpr_32 = IMPLICIT_DEF
+ %5:vgpr_32 = IMPLICIT_DEF
+ %6:vgpr_32 = IMPLICIT_DEF
+ %7:vgpr_32 = IMPLICIT_DEF
+ %8:vgpr_32 = IMPLICIT_DEF
+ %9:vgpr_32 = IMPLICIT_DEF
+ %10:vgpr_32 = IMPLICIT_DEF
+ %11:vgpr_32 = IMPLICIT_DEF
+ %12:vgpr_32 = IMPLICIT_DEF
+ %13:vgpr_32 = IMPLICIT_DEF
+ %14:vgpr_32 = IMPLICIT_DEF
+ %15:vgpr_32 = IMPLICIT_DEF
+ %16:vgpr_32 = IMPLICIT_DEF
+ %17:vgpr_32 = IMPLICIT_DEF
+ %18:vgpr_32 = IMPLICIT_DEF
+ %19:vgpr_32 = IMPLICIT_DEF
+ %20:vgpr_32 = IMPLICIT_DEF
+ %21:vgpr_32 = IMPLICIT_DEF
+ %22:vgpr_32 = IMPLICIT_DEF
+ %23:vgpr_32 = IMPLICIT_DEF
+ KILL %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, %16, %17
+ S_NOP 0, implicit-def %50:av_512
+ S_NOP 0, implicit-def %51:av_512
+ KILL %1, %2, %18, %19, %20, %21, %22, %23
+ S_NOP 0, implicit %50, implicit %51
+ S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll
index 85dd275..fcdad53 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll
@@ -4,30 +4,30 @@
define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(<1 x i64> %L1) {
; GCN-LABEL: test_iglp_opt_rev_mfma_gemm:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: v_mov_b32_e32 v32, 0
-; GCN-NEXT: ds_read_b128 v[0:3], v32
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: ds_read_b128 v[2:5], v0
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GCN-NEXT: ds_read_b128 v[28:31], v32 offset:112
-; GCN-NEXT: ds_read_b128 v[24:27], v32 offset:96
-; GCN-NEXT: ds_read_b128 v[20:23], v32 offset:80
-; GCN-NEXT: ds_read_b128 v[16:19], v32 offset:64
-; GCN-NEXT: ds_read_b128 v[4:7], v32 offset:16
-; GCN-NEXT: ds_read_b128 v[8:11], v32 offset:32
-; GCN-NEXT: ds_read_b128 v[12:15], v32 offset:48
+; GCN-NEXT: ds_read_b128 v[30:33], v0 offset:112
+; GCN-NEXT: ds_read_b128 v[26:29], v0 offset:96
+; GCN-NEXT: ds_read_b128 v[22:25], v0 offset:80
+; GCN-NEXT: ds_read_b128 v[18:21], v0 offset:64
+; GCN-NEXT: ds_read_b128 v[6:9], v0 offset:16
+; GCN-NEXT: ds_read_b128 v[10:13], v0 offset:32
+; GCN-NEXT: ds_read_b128 v[14:17], v0 offset:48
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: ds_write_b128 v32, v[0:3]
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: v_mov_b32_e32 v1, v0
+; GCN-NEXT: ds_write_b128 v0, v[2:5]
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: v_mov_b32_e32 v3, v2
; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN-NEXT: ; iglp_opt mask(0x00000001)
-; GCN-NEXT: ds_write_b128 v32, v[28:31] offset:112
-; GCN-NEXT: ds_write_b128 v32, v[24:27] offset:96
-; GCN-NEXT: ds_write_b128 v32, v[20:23] offset:80
-; GCN-NEXT: ds_write_b128 v32, v[16:19] offset:64
-; GCN-NEXT: ds_write_b128 v32, v[12:15] offset:48
-; GCN-NEXT: ds_write_b128 v32, v[8:11] offset:32
-; GCN-NEXT: ds_write_b128 v32, v[4:7] offset:16
-; GCN-NEXT: ds_write_b64 v32, v[0:1]
+; GCN-NEXT: ds_write_b128 v0, v[30:33] offset:112
+; GCN-NEXT: ds_write_b128 v0, v[26:29] offset:96
+; GCN-NEXT: ds_write_b128 v0, v[22:25] offset:80
+; GCN-NEXT: ds_write_b128 v0, v[18:21] offset:64
+; GCN-NEXT: ds_write_b128 v0, v[14:17] offset:48
+; GCN-NEXT: ds_write_b128 v0, v[10:13] offset:32
+; GCN-NEXT: ds_write_b128 v0, v[6:9] offset:16
+; GCN-NEXT: ds_write_b64 v0, v[2:3]
; GCN-NEXT: s_endpgm
entry:
call void @llvm.amdgcn.iglp.opt(i32 1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll
index ed7d88b..dcac419 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll
@@ -18,19 +18,22 @@ define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1
; GCN-LABEL: load_1d_lwe:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: v_mov_b32_e32 v8, 0
-; GCN-NEXT: v_mov_b32_e32 v6, v0
; GCN-NEXT: v_mov_b32_e32 v9, v8
; GCN-NEXT: v_mov_b32_e32 v10, v8
; GCN-NEXT: v_mov_b32_e32 v11, v8
; GCN-NEXT: v_mov_b32_e32 v12, v8
-; GCN-NEXT: v_mov_b32_e32 v0, v8
-; GCN-NEXT: v_mov_b32_e32 v1, v9
-; GCN-NEXT: v_mov_b32_e32 v2, v10
-; GCN-NEXT: v_mov_b32_e32 v3, v11
-; GCN-NEXT: v_mov_b32_e32 v4, v12
-; GCN-NEXT: image_load v[0:4], v6, s[0:7] dmask:0xf unorm lwe
+; GCN-NEXT: v_mov_b32_e32 v2, v8
+; GCN-NEXT: v_mov_b32_e32 v3, v9
+; GCN-NEXT: v_mov_b32_e32 v4, v10
+; GCN-NEXT: v_mov_b32_e32 v5, v11
+; GCN-NEXT: v_mov_b32_e32 v6, v12
+; GCN-NEXT: image_load v[2:6], v0, s[0:7] dmask:0xf unorm lwe
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dword v8, v4, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v0, v2
+; GCN-NEXT: v_mov_b32_e32 v1, v3
+; GCN-NEXT: v_mov_b32_e32 v2, v4
+; GCN-NEXT: v_mov_b32_e32 v3, v5
+; GCN-NEXT: global_store_dword v8, v6, s[8:9]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
main_body:
@@ -75,6 +78,27 @@ main_body:
}
define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice) {
+; GCN-LABEL: load_cube_lwe:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: v_mov_b32_e32 v10, 0
+; GCN-NEXT: v_mov_b32_e32 v11, v10
+; GCN-NEXT: v_mov_b32_e32 v12, v10
+; GCN-NEXT: v_mov_b32_e32 v13, v10
+; GCN-NEXT: v_mov_b32_e32 v14, v10
+; GCN-NEXT: v_mov_b32_e32 v4, v10
+; GCN-NEXT: v_mov_b32_e32 v5, v11
+; GCN-NEXT: v_mov_b32_e32 v6, v12
+; GCN-NEXT: v_mov_b32_e32 v7, v13
+; GCN-NEXT: v_mov_b32_e32 v8, v14
+; GCN-NEXT: image_load v[4:8], v[0:2], s[0:7] dmask:0xf unorm lwe da
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, v4
+; GCN-NEXT: v_mov_b32_e32 v1, v5
+; GCN-NEXT: v_mov_b32_e32 v2, v6
+; GCN-NEXT: v_mov_b32_e32 v3, v7
+; GCN-NEXT: global_store_dword v10, v8, s[8:9]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.cube.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
@@ -106,6 +130,27 @@ main_body:
}
define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice) {
+; GCN-LABEL: load_2darray_lwe:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: v_mov_b32_e32 v10, 0
+; GCN-NEXT: v_mov_b32_e32 v11, v10
+; GCN-NEXT: v_mov_b32_e32 v12, v10
+; GCN-NEXT: v_mov_b32_e32 v13, v10
+; GCN-NEXT: v_mov_b32_e32 v14, v10
+; GCN-NEXT: v_mov_b32_e32 v4, v10
+; GCN-NEXT: v_mov_b32_e32 v5, v11
+; GCN-NEXT: v_mov_b32_e32 v6, v12
+; GCN-NEXT: v_mov_b32_e32 v7, v13
+; GCN-NEXT: v_mov_b32_e32 v8, v14
+; GCN-NEXT: image_load v[4:8], v[0:2], s[0:7] dmask:0xf unorm lwe da
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, v4
+; GCN-NEXT: v_mov_b32_e32 v1, v5
+; GCN-NEXT: v_mov_b32_e32 v2, v6
+; GCN-NEXT: v_mov_b32_e32 v3, v7
+; GCN-NEXT: global_store_dword v10, v8, s[8:9]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2darray.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
index 780c7e9..ff77d5cc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-mfma-vgpr-form < %s | FileCheck -enable-var-scope --check-prefixes=VGPR,GFX90A-VGPR %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -amdgpu-mfma-vgpr-form < %s | FileCheck -enable-var-scope --check-prefixes=VGPR,GFX942-VGPR %s
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16>, <4 x i16>, <32 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
@@ -127,6 +129,122 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[34:35]
; GFX942-NEXT: global_store_dwordx4 v1, a[4:7], s[34:35] offset:16
; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f32_32x32x4bf16_1k:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v33, 0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v34, 1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v35, v33
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v32, 2
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
+; GFX90A-VGPR-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v0, s16
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, s17
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, s18
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, s19
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, s20
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, s21
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, s22
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, s23
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, s24
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v9, s25
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v10, s26
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v11, s27
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v12, s28
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v13, s29
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v14, s30
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v15, s31
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v16, s0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v17, s1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v18, s2
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v19, s3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v20, s4
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v21, s5
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v22, s6
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v23, s7
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v24, s8
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v25, s9
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v26, s10
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v27, s11
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v28, s12
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v29, s13
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v30, s14
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v31, s15
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f32_32x32x4bf16_1k v[0:31], v[34:35], v[32:33], v[0:31] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 2
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[24:27], s[34:35] offset:96
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[28:31], s[34:35] offset:112
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[16:19], s[34:35] offset:64
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[20:23], s[34:35] offset:80
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[8:11], s[34:35] offset:32
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[12:15], s[34:35] offset:48
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[0:3], s[34:35]
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[4:7], s[34:35] offset:16
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_32x32x4bf16_1k:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v33, 0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v34, 1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v35, v33
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 2
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, s16
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, s17
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, s18
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, s19
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, s20
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, s21
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s22
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s23
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s25
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s26
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s27
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, s28
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, s29
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v14, s30
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v15, s31
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, s0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, s1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, s2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, s3
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v20, s4
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v21, s5
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v22, s6
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v23, s7
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v24, s8
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v25, s9
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v26, s10
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v27, s11
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v28, s12
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v29, s13
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v30, s14
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, s15
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x4_2b_bf16 v[0:31], v[34:35], v[32:33], v[0:31] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 2
+; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[24:27], s[34:35] offset:96
+; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[28:31], s[34:35] offset:112
+; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[16:19], s[34:35] offset:64
+; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[20:23], s[34:35] offset:80
+; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[8:11], s[34:35] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[12:15], s[34:35] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[0:3], s[34:35]
+; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[4:7], s[34:35] offset:16
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%a = bitcast i64 1 to <4 x i16>
@@ -208,6 +326,62 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #
; GFX942-NEXT: global_store_dwordx4 v1, a[4:7], s[16:17] offset:16
; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[16:17]
; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f32_16x16x4bf16_1k:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v17, 0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v18, 1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v19, v17
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v16, 2
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f32_16x16x4bf16_1k v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 2
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[0:3], s[16:17]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_16x16x4bf16_1k:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, 1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, v17
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 2
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_16x16x4_4b_bf16 v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 2
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[0:3], s[16:17]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%a = bitcast i64 1 to <4 x i16>
@@ -257,6 +431,42 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0
; GFX942-NEXT: s_nop 4
; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f32_4x4x4bf16_1k:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, 1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v5
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, 2
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f32_4x4x4bf16_1k v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: s_nop 4
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_4x4x4bf16_1k:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v5
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 2
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_4x4x4_16b_bf16 v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 4
+; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%a = bitcast i64 1 to <4 x i16>
@@ -339,6 +549,63 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #
; GFX942-NEXT: global_store_dwordx4 v1, a[4:7], s[16:17] offset:16
; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[16:17]
; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f32_32x32x8bf16_1k:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v17, 0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v18, 1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v19, v17
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v16, 2
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f32_32x32x8bf16_1k v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 2
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[0:3], s[16:17]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_32x32x8bf16_1k:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, 1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, v17
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 2
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_bf16 v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 2
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[0:3], s[16:17]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%a = bitcast i64 1 to <4 x i16>
@@ -389,6 +656,43 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg)
; GFX942-NEXT: s_nop 6
; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f32_16x16x16bf16_1k:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, 1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v5
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, 2
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f32_16x16x16bf16_1k v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 2
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_16x16x16bf16_1k:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v5
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 2
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_16x16x16_bf16 v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 6
+; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%a = bitcast i64 1 to <4 x i16>
@@ -430,6 +734,38 @@ define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg, double
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: global_store_dwordx2 v0, a[0:1], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f64_4x4x4f64:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f64_4x4x4f64 v[4:5], v[0:1], v[2:3], 0
+; GFX90A-VGPR-NEXT: s_nop 3
+; GFX90A-VGPR-NEXT: v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[4:5] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f64_4x4x4f64:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f64_4x4x4_4b_f64 v[4:5], v[0:1], v[2:3], 0
+; GFX942-VGPR-NEXT: s_nop 3
+; GFX942-VGPR-NEXT: v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[4:5] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double 0.0, i32 0, i32 0, i32 0)
%mai.2 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double %mai.1, i32 1, i32 2, i32 3)
@@ -443,10 +779,10 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: v_mov_b32_e32 v2, s10
; GFX90A-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
-; GFX90A-NEXT: v_mov_b32_e32 v1, s11
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v3, s11
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[12:13], s[12:13] op_sel:[0,1]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
@@ -457,7 +793,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6
; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
@@ -471,10 +807,10 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
; GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: v_mov_b32_e32 v2, s10
; GFX942-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
-; GFX942-NEXT: v_mov_b32_e32 v1, s11
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[12:13]
+; GFX942-NEXT: v_mov_b32_e32 v3, s11
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_accvgpr_write_b32 a0, s0
; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
@@ -485,7 +821,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
; GFX942-NEXT: v_accvgpr_write_b32 a6, s6
; GFX942-NEXT: v_accvgpr_write_b32 a7, s7
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
@@ -493,6 +829,54 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9]
; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v10, s10
+; GFX90A-VGPR-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v11, s11
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[10:11], v[8:9], v[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 0
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[8:9] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[8:9]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s10
+; GFX942-VGPR-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s11
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[10:11], v[8:9], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[8:9] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[8:9]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <4 x double>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %in.1, i32 1, i32 2, i32 3)
@@ -536,6 +920,42 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_0(ptr addrspace(1)
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_0:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], 0
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 0
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_0:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], 0
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> zeroinitializer, i32 0, i32 0, i32 0)
%mai.2 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %mai.1, i32 1, i32 2, i32 3)
@@ -579,6 +999,42 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_neg1(ptr addrs
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_neg1:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], -1
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 0
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_neg1:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], -1
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double bitcast (i64 -1 to double)), i32 0, i32 0, i32 0)
%mai.2 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %mai.1, i32 1, i32 2, i32 3)
@@ -622,6 +1078,42 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_1(ptr addrspace(1)
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_1:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], 1.0
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 0
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_1:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], 1.0
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double 1.0), i32 0, i32 0, i32 0)
%mai.2 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %mai.1, i32 1, i32 2, i32 3)
@@ -665,6 +1157,42 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_neg1(ptr addrspace
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_neg1:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], -1.0
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 0
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_neg1:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], -1.0
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double -1.0), i32 0, i32 0, i32 0)
%mai.2 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %mai.1, i32 1, i32 2, i32 3)
@@ -708,6 +1236,42 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64(ptr addrspa
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], 64
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 0
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], 64
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double bitcast (i64 64 to double)), i32 0, i32 0, i32 0)
%mai.2 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %mai.1, i32 1, i32 2, i32 3)
@@ -767,6 +1331,64 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bits:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, 64
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9]
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bits:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 64
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9]
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double bitcast (i64 274877906944 to double)), i32 0, i32 0, i32 0)
%mai.2 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %mai.1, i32 1, i32 2, i32 3)
@@ -826,6 +1448,58 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and_low:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v0, 64
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7]
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 0
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and_low:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 64
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7]
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double bitcast (i64 274877907008 to double)), i32 0, i32 0, i32 0)
%mai.2 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %mai.1, i32 1, i32 2, i32 3)
@@ -885,6 +1559,58 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_low:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7]
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 0
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_low:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7]
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double bitcast (<2 x float> splat (float 1.0) to double)), i32 0, i32 0, i32 0)
%mai.2 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %mai.1, i32 1, i32 2, i32 3)
@@ -898,20 +1624,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0x3ff00000
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, v2
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x3ff00000
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s2
-; GFX90A-NEXT: v_mov_b32_e32 v1, s3
+; GFX90A-NEXT: v_mov_b32_e32 v2, s2
+; GFX90A-NEXT: v_mov_b32_e32 v3, s3
; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7]
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
@@ -925,20 +1651,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, 0x3ff00000
-; GFX942-NEXT: v_accvgpr_write_b32 a7, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x3ff00000
+; GFX942-NEXT: v_accvgpr_write_b32 a7, v0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, s2
-; GFX942-NEXT: v_mov_b32_e32 v1, s3
+; GFX942-NEXT: v_mov_b32_e32 v2, s2
+; GFX942-NEXT: v_mov_b32_e32 v3, s3
; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a5, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7]
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
@@ -946,6 +1672,64 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_imm:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, 0x3ff00000
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v12, s2
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v13, s3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[12:13], v[10:11], v[2:9]
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_imm:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, 0x3ff00000
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, s2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, s3
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9]
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 1.0>, i32 0, i32 0, i32 0)
store <4 x double> %mai.1, ptr addrspace(1) %arg
@@ -957,21 +1741,21 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0x405ec000
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x405ec000
; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s2
-; GFX90A-NEXT: v_mov_b32_e32 v1, s3
+; GFX90A-NEXT: v_mov_b32_e32 v2, s2
+; GFX90A-NEXT: v_mov_b32_e32 v3, s3
; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a1
; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a1
; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a1
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7]
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
@@ -984,21 +1768,21 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT: v_mov_b32_e32 v2, 0x405ec000
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x405ec000
; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, s2
-; GFX942-NEXT: v_mov_b32_e32 v1, s3
+; GFX942-NEXT: v_mov_b32_e32 v2, s2
+; GFX942-NEXT: v_mov_b32_e32 v3, s3
; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a3, a1
; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a5, a1
; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a7, a1
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7]
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
@@ -1006,6 +1790,64 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_lit:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, 0x405ec000
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v12, s2
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v13, s3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[12:13], v[10:11], v[2:9]
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_lit:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 0x405ec000
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, s2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, s3
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9]
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 123.0, double 123.0, double 123.0, double 123.0>, i32 0, i32 0, i32 0)
store <4 x double> %mai.1, ptr addrspace(1) %arg
@@ -1015,3 +1857,4 @@ bb:
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GCN: {{.*}}
+; VGPR: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
index 92af34f..beda16c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
@@ -1,13 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-VGPRCD,GFX942-SDAG,GFX942-VGPRCD-SDAG %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-VGPRCD,GFX942-GISEL,GFX942-VGPRCD-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 < %s | FileCheck --check-prefixes=GFX942,GFX942-AGPRCD,GFX942-SDAG,GFX942-AGPRCD-SDAG %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 < %s | FileCheck --check-prefixes=GFX942,GFX942-AGPRCD,GFX942-GISEL,GFX942-AGPRCD-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GFX950,GFX950-VGPRCD,GFX950-SDAG,GFX950-VGPRCD-SDAG %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GFX950,GFX950-VGPRCD,GFX950-GISEL,GFX950-VGPRCD-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -stress-regalloc=10 < %s | FileCheck --check-prefixes=GFX950,GFX950-AGPRCD,GFX950-SDAG,GFX950-AGPRCD-SDAG %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx950 -stress-regalloc=10 < %s | FileCheck --check-prefixes=GFX950,GFX950-AGPRCD,GFX950-GISEL,GFX950-AGPRCD-GISEL %s
declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64, i64, <4 x i32>, i32, i32, i32)
declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64, i64, <16 x i32>, i32, i32, i32)
@@ -35,26 +31,26 @@ declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32>, <4 x i3
declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32>, <4 x i32>, <16 x float>, i32, i32, i32)
define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) #0 {
-; GFX942-SDAG-LABEL: test_mfma_i32_16x16x32i8:
-; GFX942-SDAG: ; %bb.0: ; %bb
-; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3
-; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-SDAG-NEXT: s_nop 6
-; GFX942-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX942-SDAG-NEXT: s_endpgm
+; GFX942-VGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: test_mfma_i32_16x16x32i8:
; GFX942-GISEL: ; %bb.0: ; %bb
@@ -77,26 +73,47 @@ define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) #0 {
; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GFX942-GISEL-NEXT: s_endpgm
;
-; GFX950-SDAG-LABEL: test_mfma_i32_16x16x32i8:
-; GFX950-SDAG: ; %bb.0: ; %bb
-; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX950-SDAG-NEXT: s_nop 7
-; GFX950-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX950-SDAG-NEXT: s_endpgm
+; GFX942-AGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: test_mfma_i32_16x16x32i8:
; GFX950-GISEL: ; %bb.0: ; %bb
@@ -118,6 +135,27 @@ define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) #0 {
; GFX950-GISEL-NEXT: s_nop 6
; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GFX950-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
bb:
%in.1 = load <4 x i32>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64 4294967298, i64 12884901892, <4 x i32> %in.1, i32 1, i32 2, i32 3)
@@ -281,26 +319,26 @@ bb:
}
define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg) #0 {
-; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
-; GFX942-SDAG: ; %bb.0: ; %bb
-; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3
-; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-SDAG-NEXT: s_nop 6
-; GFX942-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX942-SDAG-NEXT: s_endpgm
+; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
; GFX942-GISEL: ; %bb.0: ; %bb
@@ -323,26 +361,47 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg)
; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GFX942-GISEL-NEXT: s_endpgm
;
-; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
-; GFX950-SDAG: ; %bb.0: ; %bb
-; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX950-SDAG-NEXT: s_nop 7
-; GFX950-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX950-SDAG-NEXT: s_endpgm
+; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
; GFX950-GISEL: ; %bb.0: ; %bb
@@ -364,6 +423,27 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg)
; GFX950-GISEL-NEXT: s_nop 6
; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GFX950-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.bf8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 1, i32 2, i32 3)
@@ -372,26 +452,26 @@ bb:
}
define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg) #0 {
-; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
-; GFX942-SDAG: ; %bb.0: ; %bb
-; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3
-; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-SDAG-NEXT: s_nop 6
-; GFX942-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX942-SDAG-NEXT: s_endpgm
+; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
; GFX942-GISEL: ; %bb.0: ; %bb
@@ -414,26 +494,47 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg)
; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GFX942-GISEL-NEXT: s_endpgm
;
-; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
-; GFX950-SDAG: ; %bb.0: ; %bb
-; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX950-SDAG-NEXT: s_nop 7
-; GFX950-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX950-SDAG-NEXT: s_endpgm
+; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
; GFX950-GISEL: ; %bb.0: ; %bb
@@ -455,6 +556,27 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg)
; GFX950-GISEL-NEXT: s_nop 6
; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GFX950-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.fp8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 1, i32 2, i32 3)
@@ -463,26 +585,26 @@ bb:
}
define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg) #0 {
-; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
-; GFX942-SDAG: ; %bb.0: ; %bb
-; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3
-; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-SDAG-NEXT: s_nop 6
-; GFX942-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX942-SDAG-NEXT: s_endpgm
+; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
; GFX942-GISEL: ; %bb.0: ; %bb
@@ -505,26 +627,47 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg)
; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GFX942-GISEL-NEXT: s_endpgm
;
-; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
-; GFX950-SDAG: ; %bb.0: ; %bb
-; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX950-SDAG-NEXT: s_nop 7
-; GFX950-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX950-SDAG-NEXT: s_endpgm
+; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
; GFX950-GISEL: ; %bb.0: ; %bb
@@ -546,6 +689,27 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg)
; GFX950-GISEL-NEXT: s_nop 6
; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GFX950-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.bf8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 1, i32 2, i32 3)
@@ -554,26 +718,26 @@ bb:
}
define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg) #0 {
-; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
-; GFX942-SDAG: ; %bb.0: ; %bb
-; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3
-; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-SDAG-NEXT: s_nop 6
-; GFX942-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX942-SDAG-NEXT: s_endpgm
+; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
; GFX942-GISEL: ; %bb.0: ; %bb
@@ -596,26 +760,47 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg)
; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GFX942-GISEL-NEXT: s_endpgm
;
-; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
-; GFX950-SDAG: ; %bb.0: ; %bb
-; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX950-SDAG-NEXT: s_nop 7
-; GFX950-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX950-SDAG-NEXT: s_endpgm
+; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
; GFX950-GISEL: ; %bb.0: ; %bb
@@ -637,6 +822,27 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg)
; GFX950-GISEL-NEXT: s_nop 6
; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GFX950-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.fp8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 1, i32 2, i32 3)
@@ -1269,20 +1475,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <
; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX942-VGPRCD-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, 0
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s6
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v7, s6
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[8:9]
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9]
; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_f16:
@@ -1291,18 +1497,18 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s6
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s6
; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9]
; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX942-AGPRCD-LABEL: test_smfmac_f32_16x16x32_f16:
@@ -1332,20 +1538,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <
; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX950-VGPRCD-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, 0
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s6
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v7, s6
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[8:9]
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9]
; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_f16:
@@ -1354,18 +1560,18 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s6
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s6
; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9]
; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX950-AGPRCD-LABEL: test_smfmac_f32_16x16x32_f16:
@@ -1681,20 +1887,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg,
; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX942-VGPRCD-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, 0
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s6
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v7, s6
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[8:9]
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9]
; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16:
@@ -1703,18 +1909,18 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg,
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s6
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s6
; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9]
; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX942-AGPRCD-LABEL: test_smfmac_f32_16x16x32_bf16:
@@ -1744,20 +1950,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg,
; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX950-VGPRCD-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, 0
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s6
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v7, s6
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[8:9]
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9]
; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16:
@@ -1766,18 +1972,18 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg,
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s6
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s6
; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9]
; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX950-AGPRCD-LABEL: test_smfmac_f32_16x16x32_bf16:
@@ -2093,23 +2299,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2
; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_i32_16x16x64_i8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_i8:
@@ -2119,21 +2325,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_i8:
@@ -2197,23 +2403,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2
; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_i32_16x16x64_i8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_i8:
@@ -2223,21 +2429,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_i8:
@@ -2309,16 +2515,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -2327,7 +2533,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-VGPRCD-SDAG-NEXT: s_nop 7
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
@@ -2461,16 +2667,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -2479,7 +2685,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
; GFX950-VGPRCD-SDAG-NEXT: s_nop 2
@@ -2619,23 +2825,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
@@ -2645,21 +2851,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
@@ -2723,23 +2929,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
@@ -2749,21 +2955,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
@@ -2834,23 +3040,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
@@ -2860,21 +3066,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
@@ -2938,23 +3144,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
@@ -2964,21 +3170,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
@@ -3049,23 +3255,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
@@ -3075,21 +3281,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
@@ -3153,23 +3359,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
@@ -3179,21 +3385,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
@@ -3264,23 +3470,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
@@ -3290,21 +3496,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
@@ -3368,23 +3574,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
@@ -3394,21 +3600,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
@@ -3480,16 +3686,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -3498,7 +3704,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-VGPRCD-SDAG-NEXT: s_nop 7
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
@@ -3632,16 +3838,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -3650,7 +3856,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
; GFX950-VGPRCD-SDAG-NEXT: s_nop 2
@@ -3791,16 +3997,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -3809,7 +4015,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-VGPRCD-SDAG-NEXT: s_nop 7
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
@@ -3943,16 +4149,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -3961,7 +4167,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
; GFX950-VGPRCD-SDAG-NEXT: s_nop 2
@@ -4102,16 +4308,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -4120,7 +4326,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-VGPRCD-SDAG-NEXT: s_nop 7
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
@@ -4254,16 +4460,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -4272,7 +4478,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
; GFX950-VGPRCD-SDAG-NEXT: s_nop 2
@@ -4413,16 +4619,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -4431,7 +4637,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-VGPRCD-SDAG-NEXT: s_nop 7
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
@@ -4565,16 +4771,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -4583,7 +4789,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
; GFX950-VGPRCD-SDAG-NEXT: s_nop 2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
index 452033f..8081a15 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
@@ -15,9 +15,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; GCN-NEXT: v_mov_b64_e32 v[12:13], 48
-; GCN-NEXT: v_mov_b64_e32 v[14:15], 32
-; GCN-NEXT: v_mov_b64_e32 v[16:17], 16
+; GCN-NEXT: v_mov_b64_e32 v[8:9], 48
+; GCN-NEXT: v_mov_b64_e32 v[10:11], 32
+; GCN-NEXT: v_mov_b64_e32 v[12:13], 16
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -39,42 +39,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x
; GCN-NEXT: v_accvgpr_write_b32 a13, s21
; GCN-NEXT: v_accvgpr_write_b32 a14, s22
; GCN-NEXT: v_accvgpr_write_b32 a15, s23
-; GCN-NEXT: v_mov_b64_e32 v[18:19], 0
-; GCN-NEXT: v_mov_b32_e32 v8, s16
+; GCN-NEXT: v_mov_b64_e32 v[14:15], 0
+; GCN-NEXT: v_mov_b32_e32 v16, s16
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15]
; GCN-NEXT: v_mov_b32_e32 v0, s20
; GCN-NEXT: v_mov_b32_e32 v1, s21
; GCN-NEXT: v_mov_b32_e32 v2, s22
; GCN-NEXT: v_mov_b32_e32 v3, s23
-; GCN-NEXT: v_mov_b32_e32 v9, s17
-; GCN-NEXT: v_mov_b32_e32 v10, s18
-; GCN-NEXT: v_mov_b32_e32 v11, s19
+; GCN-NEXT: v_mov_b32_e32 v17, s17
+; GCN-NEXT: v_mov_b32_e32 v18, s18
+; GCN-NEXT: v_mov_b32_e32 v19, s19
; GCN-NEXT: s_nop 4
-; GCN-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v3, s11
-; GCN-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, s12
; GCN-NEXT: v_mov_b32_e32 v1, s13
; GCN-NEXT: v_mov_b32_e32 v2, s14
; GCN-NEXT: v_mov_b32_e32 v3, s15
-; GCN-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
@@ -88,9 +88,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; GCN-NEXT: v_mov_b64_e32 v[12:13], 48
-; GCN-NEXT: v_mov_b64_e32 v[14:15], 32
-; GCN-NEXT: v_mov_b64_e32 v[16:17], 16
+; GCN-NEXT: v_mov_b64_e32 v[8:9], 48
+; GCN-NEXT: v_mov_b64_e32 v[10:11], 32
+; GCN-NEXT: v_mov_b64_e32 v[12:13], 16
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -112,42 +112,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0
; GCN-NEXT: v_accvgpr_write_b32 a13, s21
; GCN-NEXT: v_accvgpr_write_b32 a14, s22
; GCN-NEXT: v_accvgpr_write_b32 a15, s23
-; GCN-NEXT: v_mov_b64_e32 v[18:19], 0
-; GCN-NEXT: v_mov_b32_e32 v8, s16
+; GCN-NEXT: v_mov_b64_e32 v[14:15], 0
+; GCN-NEXT: v_mov_b32_e32 v16, s16
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
; GCN-NEXT: v_mov_b32_e32 v0, s20
; GCN-NEXT: v_mov_b32_e32 v1, s21
; GCN-NEXT: v_mov_b32_e32 v2, s22
; GCN-NEXT: v_mov_b32_e32 v3, s23
-; GCN-NEXT: v_mov_b32_e32 v9, s17
-; GCN-NEXT: v_mov_b32_e32 v10, s18
-; GCN-NEXT: v_mov_b32_e32 v11, s19
+; GCN-NEXT: v_mov_b32_e32 v17, s17
+; GCN-NEXT: v_mov_b32_e32 v18, s18
+; GCN-NEXT: v_mov_b32_e32 v19, s19
; GCN-NEXT: s_nop 4
-; GCN-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v3, s11
-; GCN-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, s12
; GCN-NEXT: v_mov_b32_e32 v1, s13
; GCN-NEXT: v_mov_b32_e32 v2, s14
; GCN-NEXT: v_mov_b32_e32 v3, s15
-; GCN-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 2, i32 3, i32 1)
@@ -252,62 +252,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd(<8 x bfloat> %arg
; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; GCN-NEXT: v_mov_b32_e32 v12, 0
+; GCN-NEXT: v_mov_b32_e32 v44, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GCN-NEXT: v_accvgpr_write_b32 a31, s23
-; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GCN-NEXT: v_accvgpr_write_b32 a30, s22
-; GCN-NEXT: v_accvgpr_write_b32 a29, s21
-; GCN-NEXT: v_accvgpr_write_b32 a28, s20
-; GCN-NEXT: v_accvgpr_write_b32 a27, s19
-; GCN-NEXT: v_accvgpr_write_b32 a26, s18
-; GCN-NEXT: v_accvgpr_write_b32 a25, s17
-; GCN-NEXT: v_accvgpr_write_b32 a24, s16
-; GCN-NEXT: v_accvgpr_write_b32 a23, s15
-; GCN-NEXT: v_accvgpr_write_b32 a22, s14
-; GCN-NEXT: v_accvgpr_write_b32 a21, s13
-; GCN-NEXT: v_accvgpr_write_b32 a20, s12
-; GCN-NEXT: v_accvgpr_write_b32 a19, s11
-; GCN-NEXT: v_accvgpr_write_b32 a18, s10
-; GCN-NEXT: v_accvgpr_write_b32 a17, s9
-; GCN-NEXT: v_accvgpr_write_b32 a16, s8
-; GCN-NEXT: v_mov_b32_e32 v8, s20
-; GCN-NEXT: v_mov_b32_e32 v9, s21
-; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[16:31]
-; GCN-NEXT: v_mov_b32_e32 v10, s22
-; GCN-NEXT: v_mov_b32_e32 v11, s23
-; GCN-NEXT: v_mov_b32_e32 v0, s16
-; GCN-NEXT: v_mov_b32_e32 v1, s17
-; GCN-NEXT: v_mov_b32_e32 v2, s18
-; GCN-NEXT: v_mov_b32_e32 v3, s19
-; GCN-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; GCN-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; GCN-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; GCN-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; GCN-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; GCN-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; GCN-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; GCN-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; GCN-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; GCN-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; GCN-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GCN-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v40, s20
+; GCN-NEXT: v_mov_b32_e32 v41, s21
+; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31]
+; GCN-NEXT: v_mov_b32_e32 v42, s22
+; GCN-NEXT: v_mov_b32_e32 v43, s23
+; GCN-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 2
+; GCN-NEXT: v_mov_b32_e32 v16, s16
+; GCN-NEXT: v_mov_b32_e32 v17, s17
+; GCN-NEXT: v_mov_b32_e32 v18, s18
+; GCN-NEXT: v_mov_b32_e32 v19, s19
+; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v0, s12
-; GCN-NEXT: v_mov_b32_e32 v1, s13
-; GCN-NEXT: v_mov_b32_e32 v2, s14
-; GCN-NEXT: v_mov_b32_e32 v3, s15
-; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT: v_mov_b32_e32 v16, s12
+; GCN-NEXT: v_mov_b32_e32 v17, s13
+; GCN-NEXT: v_mov_b32_e32 v18, s14
+; GCN-NEXT: v_mov_b32_e32 v19, s15
+; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v0, s8
-; GCN-NEXT: v_mov_b32_e32 v1, s9
-; GCN-NEXT: v_mov_b32_e32 v2, s10
-; GCN-NEXT: v_mov_b32_e32 v3, s11
-; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1
+; GCN-NEXT: v_mov_b32_e32 v16, s8
+; GCN-NEXT: v_mov_b32_e32 v17, s9
+; GCN-NEXT: v_mov_b32_e32 v18, s10
+; GCN-NEXT: v_mov_b32_e32 v19, s11
+; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
@@ -322,62 +315,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd__flags(<8 x bfloa
; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; GCN-NEXT: v_mov_b32_e32 v12, 0
+; GCN-NEXT: v_mov_b32_e32 v44, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GCN-NEXT: v_accvgpr_write_b32 a31, s23
-; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GCN-NEXT: v_accvgpr_write_b32 a30, s22
-; GCN-NEXT: v_accvgpr_write_b32 a29, s21
-; GCN-NEXT: v_accvgpr_write_b32 a28, s20
-; GCN-NEXT: v_accvgpr_write_b32 a27, s19
-; GCN-NEXT: v_accvgpr_write_b32 a26, s18
-; GCN-NEXT: v_accvgpr_write_b32 a25, s17
-; GCN-NEXT: v_accvgpr_write_b32 a24, s16
-; GCN-NEXT: v_accvgpr_write_b32 a23, s15
-; GCN-NEXT: v_accvgpr_write_b32 a22, s14
-; GCN-NEXT: v_accvgpr_write_b32 a21, s13
-; GCN-NEXT: v_accvgpr_write_b32 a20, s12
-; GCN-NEXT: v_accvgpr_write_b32 a19, s11
-; GCN-NEXT: v_accvgpr_write_b32 a18, s10
-; GCN-NEXT: v_accvgpr_write_b32 a17, s9
-; GCN-NEXT: v_accvgpr_write_b32 a16, s8
-; GCN-NEXT: v_mov_b32_e32 v8, s20
-; GCN-NEXT: v_mov_b32_e32 v9, s21
-; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
-; GCN-NEXT: v_mov_b32_e32 v10, s22
-; GCN-NEXT: v_mov_b32_e32 v11, s23
-; GCN-NEXT: v_mov_b32_e32 v0, s16
-; GCN-NEXT: v_mov_b32_e32 v1, s17
-; GCN-NEXT: v_mov_b32_e32 v2, s18
-; GCN-NEXT: v_mov_b32_e32 v3, s19
-; GCN-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; GCN-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; GCN-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; GCN-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; GCN-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; GCN-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; GCN-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; GCN-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; GCN-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; GCN-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; GCN-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GCN-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v40, s20
+; GCN-NEXT: v_mov_b32_e32 v41, s21
+; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
+; GCN-NEXT: v_mov_b32_e32 v42, s22
+; GCN-NEXT: v_mov_b32_e32 v43, s23
+; GCN-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 2
+; GCN-NEXT: v_mov_b32_e32 v16, s16
+; GCN-NEXT: v_mov_b32_e32 v17, s17
+; GCN-NEXT: v_mov_b32_e32 v18, s18
+; GCN-NEXT: v_mov_b32_e32 v19, s19
+; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v0, s12
-; GCN-NEXT: v_mov_b32_e32 v1, s13
-; GCN-NEXT: v_mov_b32_e32 v2, s14
-; GCN-NEXT: v_mov_b32_e32 v3, s15
-; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT: v_mov_b32_e32 v16, s12
+; GCN-NEXT: v_mov_b32_e32 v17, s13
+; GCN-NEXT: v_mov_b32_e32 v18, s14
+; GCN-NEXT: v_mov_b32_e32 v19, s15
+; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v0, s8
-; GCN-NEXT: v_mov_b32_e32 v1, s9
-; GCN-NEXT: v_mov_b32_e32 v2, s10
-; GCN-NEXT: v_mov_b32_e32 v3, s11
-; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1
+; GCN-NEXT: v_mov_b32_e32 v16, s8
+; GCN-NEXT: v_mov_b32_e32 v17, s9
+; GCN-NEXT: v_mov_b32_e32 v18, s10
+; GCN-NEXT: v_mov_b32_e32 v19, s11
+; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 1, i32 2, i32 3)
@@ -393,35 +379,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac(<8 x bfloat>
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GCN-NEXT: v_accvgpr_write_b32 a0, s8
-; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GCN-NEXT: v_accvgpr_write_b32 a1, s9
-; GCN-NEXT: v_accvgpr_write_b32 a2, s10
-; GCN-NEXT: v_accvgpr_write_b32 a3, s11
-; GCN-NEXT: v_accvgpr_write_b32 a4, s12
-; GCN-NEXT: v_accvgpr_write_b32 a5, s13
-; GCN-NEXT: v_accvgpr_write_b32 a6, s14
-; GCN-NEXT: v_accvgpr_write_b32 a7, s15
-; GCN-NEXT: v_accvgpr_write_b32 a8, s16
-; GCN-NEXT: v_accvgpr_write_b32 a9, s17
-; GCN-NEXT: v_accvgpr_write_b32 a10, s18
-; GCN-NEXT: v_accvgpr_write_b32 a11, s19
-; GCN-NEXT: v_accvgpr_write_b32 a12, s20
-; GCN-NEXT: v_accvgpr_write_b32 a13, s21
-; GCN-NEXT: v_accvgpr_write_b32 a14, s22
-; GCN-NEXT: v_accvgpr_write_b32 a15, s23
+; GCN-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; GCN-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; GCN-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GCN-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GCN-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GCN-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GCN-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GCN-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15]
-; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15]
+; GCN-NEXT: v_mov_b32_e32 v16, 0
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 2
-; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; GCN-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
store <16 x float> %result, ptr addrspace(1) %out
@@ -435,40 +413,32 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac_flags(<8 x bf
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GCN-NEXT: v_accvgpr_write_b32 a0, s8
-; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GCN-NEXT: v_accvgpr_write_b32 a1, s9
-; GCN-NEXT: v_accvgpr_write_b32 a2, s10
-; GCN-NEXT: v_accvgpr_write_b32 a3, s11
-; GCN-NEXT: v_accvgpr_write_b32 a4, s12
-; GCN-NEXT: v_accvgpr_write_b32 a5, s13
-; GCN-NEXT: v_accvgpr_write_b32 a6, s14
-; GCN-NEXT: v_accvgpr_write_b32 a7, s15
-; GCN-NEXT: v_accvgpr_write_b32 a8, s16
-; GCN-NEXT: v_accvgpr_write_b32 a9, s17
-; GCN-NEXT: v_accvgpr_write_b32 a10, s18
-; GCN-NEXT: v_accvgpr_write_b32 a11, s19
-; GCN-NEXT: v_accvgpr_write_b32 a12, s20
-; GCN-NEXT: v_accvgpr_write_b32 a13, s21
-; GCN-NEXT: v_accvgpr_write_b32 a14, s22
-; GCN-NEXT: v_accvgpr_write_b32 a15, s23
+; GCN-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; GCN-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; GCN-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GCN-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GCN-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GCN-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GCN-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GCN-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
-; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; GCN-NEXT: v_mov_b32_e32 v16, 0
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 2
-; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; GCN-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 3, i32 2, i32 1)
store <16 x float> %result, ptr addrspace(1) %out
ret void
}
-attributes #0 = { "amdgpu-flat-work-group-size"="512,512" }
+attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" }
attributes #1 = { "amdgpu-flat-work-group-size"="1,64" }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index 9bdae28f..d81ec1c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -141,20 +141,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: v_mov_b32_e32 v12, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
+; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
@@ -166,16 +164,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
@@ -183,20 +179,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v12, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
+; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
@@ -266,20 +260,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: v_mov_b32_e32 v12, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
+; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
@@ -291,16 +283,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
@@ -308,20 +298,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v12, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
+; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
@@ -394,9 +382,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b64_e32 v[12:13], 48
-; SDAG-NEXT: v_mov_b64_e32 v[14:15], 32
-; SDAG-NEXT: v_mov_b64_e32 v[16:17], 16
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], 48
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], 16
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -418,42 +406,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
-; SDAG-NEXT: v_mov_b64_e32 v[18:19], 0
-; SDAG-NEXT: v_mov_b32_e32 v8, s16
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15]
; SDAG-NEXT: v_mov_b32_e32 v0, s20
; SDAG-NEXT: v_mov_b32_e32 v1, s21
; SDAG-NEXT: v_mov_b32_e32 v2, s22
; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: v_mov_b32_e32 v9, s17
-; SDAG-NEXT: v_mov_b32_e32 v10, s18
-; SDAG-NEXT: v_mov_b32_e32 v11, s19
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
; SDAG-NEXT: s_nop 4
-; SDAG-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
; SDAG-NEXT: v_mov_b32_e32 v2, s10
; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s12
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -518,9 +506,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 48
-; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 32
-; HEURRC-NEXT: v_mov_b64_e32 v[16:17], 16
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], 48
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], 32
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 16
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -542,42 +530,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
-; HEURRC-NEXT: v_mov_b64_e32 v[18:19], 0
-; HEURRC-NEXT: v_mov_b32_e32 v8, s16
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 0
+; HEURRC-NEXT: v_mov_b32_e32 v16, s16
; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15]
; HEURRC-NEXT: v_mov_b32_e32 v0, s20
; HEURRC-NEXT: v_mov_b32_e32 v1, s21
; HEURRC-NEXT: v_mov_b32_e32 v2, s22
; HEURRC-NEXT: v_mov_b32_e32 v3, s23
-; HEURRC-NEXT: v_mov_b32_e32 v9, s17
-; HEURRC-NEXT: v_mov_b32_e32 v10, s18
-; HEURRC-NEXT: v_mov_b32_e32 v11, s19
+; HEURRC-NEXT: v_mov_b32_e32 v17, s17
+; HEURRC-NEXT: v_mov_b32_e32 v18, s18
+; HEURRC-NEXT: v_mov_b32_e32 v19, s19
; HEURRC-NEXT: s_nop 4
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s8
; HEURRC-NEXT: v_mov_b32_e32 v1, s9
; HEURRC-NEXT: v_mov_b32_e32 v2, s10
; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s12
; HEURRC-NEXT: v_mov_b32_e32 v1, s13
; HEURRC-NEXT: v_mov_b32_e32 v2, s14
; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -585,9 +573,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 48
-; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 32
-; VGPRRC-NEXT: v_mov_b64_e32 v[48:49], 16
+; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], 48
+; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], 32
+; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 16
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
@@ -601,43 +589,43 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; VGPRRC-NEXT: v_mov_b64_e32 v[50:51], 0
-; VGPRRC-NEXT: v_mov_b32_e32 v40, s16
+; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0
+; VGPRRC-NEXT: v_mov_b32_e32 v48, s16
; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15]
-; VGPRRC-NEXT: v_mov_b32_e32 v41, s17
-; VGPRRC-NEXT: v_mov_b32_e32 v42, s18
-; VGPRRC-NEXT: v_mov_b32_e32 v43, s19
+; VGPRRC-NEXT: v_mov_b32_e32 v49, s17
+; VGPRRC-NEXT: v_mov_b32_e32 v50, s18
+; VGPRRC-NEXT: v_mov_b32_e32 v51, s19
; VGPRRC-NEXT: s_nop 7
; VGPRRC-NEXT: s_nop 0
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[28:31], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[24:27], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[48:49], v[20:23], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[50:51], v[16:19], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[40:43], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[48:51], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: v_mov_b32_e32 v0, s20
; VGPRRC-NEXT: v_mov_b32_e32 v1, s21
; VGPRRC-NEXT: v_mov_b32_e32 v2, s22
; VGPRRC-NEXT: v_mov_b32_e32 v3, s23
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT: global_store_dwordx4 v[50:51], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s12
; VGPRRC-NEXT: v_mov_b32_e32 v1, s13
; VGPRRC-NEXT: v_mov_b32_e32 v2, s14
; VGPRRC-NEXT: v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT: global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_f32_32x32x16_f16:
@@ -776,9 +764,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b64_e32 v[12:13], 48
-; SDAG-NEXT: v_mov_b64_e32 v[14:15], 32
-; SDAG-NEXT: v_mov_b64_e32 v[16:17], 16
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], 48
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], 16
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -800,42 +788,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
-; SDAG-NEXT: v_mov_b64_e32 v[18:19], 0
-; SDAG-NEXT: v_mov_b32_e32 v8, s16
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
; SDAG-NEXT: v_mov_b32_e32 v0, s20
; SDAG-NEXT: v_mov_b32_e32 v1, s21
; SDAG-NEXT: v_mov_b32_e32 v2, s22
; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: v_mov_b32_e32 v9, s17
-; SDAG-NEXT: v_mov_b32_e32 v10, s18
-; SDAG-NEXT: v_mov_b32_e32 v11, s19
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
; SDAG-NEXT: s_nop 4
-; SDAG-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
; SDAG-NEXT: v_mov_b32_e32 v2, s10
; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s12
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -900,9 +888,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 48
-; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 32
-; HEURRC-NEXT: v_mov_b64_e32 v[16:17], 16
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], 48
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], 32
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 16
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -924,42 +912,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
-; HEURRC-NEXT: v_mov_b64_e32 v[18:19], 0
-; HEURRC-NEXT: v_mov_b32_e32 v8, s16
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 0
+; HEURRC-NEXT: v_mov_b32_e32 v16, s16
; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
; HEURRC-NEXT: v_mov_b32_e32 v0, s20
; HEURRC-NEXT: v_mov_b32_e32 v1, s21
; HEURRC-NEXT: v_mov_b32_e32 v2, s22
; HEURRC-NEXT: v_mov_b32_e32 v3, s23
-; HEURRC-NEXT: v_mov_b32_e32 v9, s17
-; HEURRC-NEXT: v_mov_b32_e32 v10, s18
-; HEURRC-NEXT: v_mov_b32_e32 v11, s19
+; HEURRC-NEXT: v_mov_b32_e32 v17, s17
+; HEURRC-NEXT: v_mov_b32_e32 v18, s18
+; HEURRC-NEXT: v_mov_b32_e32 v19, s19
; HEURRC-NEXT: s_nop 4
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s8
; HEURRC-NEXT: v_mov_b32_e32 v1, s9
; HEURRC-NEXT: v_mov_b32_e32 v2, s10
; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s12
; HEURRC-NEXT: v_mov_b32_e32 v1, s13
; HEURRC-NEXT: v_mov_b32_e32 v2, s14
; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -967,9 +955,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 48
-; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 32
-; VGPRRC-NEXT: v_mov_b64_e32 v[48:49], 16
+; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], 48
+; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], 32
+; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 16
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
@@ -983,43 +971,43 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; VGPRRC-NEXT: v_mov_b64_e32 v[50:51], 0
-; VGPRRC-NEXT: v_mov_b32_e32 v40, s16
+; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0
+; VGPRRC-NEXT: v_mov_b32_e32 v48, s16
; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:2 abid:3 blgp:1
-; VGPRRC-NEXT: v_mov_b32_e32 v41, s17
-; VGPRRC-NEXT: v_mov_b32_e32 v42, s18
-; VGPRRC-NEXT: v_mov_b32_e32 v43, s19
+; VGPRRC-NEXT: v_mov_b32_e32 v49, s17
+; VGPRRC-NEXT: v_mov_b32_e32 v50, s18
+; VGPRRC-NEXT: v_mov_b32_e32 v51, s19
; VGPRRC-NEXT: s_nop 7
; VGPRRC-NEXT: s_nop 0
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[28:31], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[24:27], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[48:49], v[20:23], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[50:51], v[16:19], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[40:43], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[48:51], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: v_mov_b32_e32 v0, s20
; VGPRRC-NEXT: v_mov_b32_e32 v1, s21
; VGPRRC-NEXT: v_mov_b32_e32 v2, s22
; VGPRRC-NEXT: v_mov_b32_e32 v3, s23
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT: global_store_dwordx4 v[50:51], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s12
; VGPRRC-NEXT: v_mov_b32_e32 v1, s13
; VGPRRC-NEXT: v_mov_b32_e32 v2, s14
; VGPRRC-NEXT: v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT: global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_f32_32x32x16_f16__flags:
@@ -1505,62 +1493,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; SDAG-NEXT: v_mov_b32_e32 v12, 0
+; SDAG-NEXT: v_mov_b32_e32 v44, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; SDAG-NEXT: v_accvgpr_write_b32 a31, s23
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; SDAG-NEXT: v_accvgpr_write_b32 a30, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a29, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a28, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a27, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a26, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a25, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a24, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a23, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a22, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a21, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a20, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a19, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a18, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a17, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a16, s8
-; SDAG-NEXT: v_mov_b32_e32 v8, s20
-; SDAG-NEXT: v_mov_b32_e32 v9, s21
-; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31]
-; SDAG-NEXT: v_mov_b32_e32 v10, s22
-; SDAG-NEXT: v_mov_b32_e32 v11, s23
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; SDAG-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; SDAG-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; SDAG-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; SDAG-NEXT: v_mov_b32_e32 v40, s20
+; SDAG-NEXT: v_mov_b32_e32 v41, s21
+; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31]
+; SDAG-NEXT: v_mov_b32_e32 v42, s22
+; SDAG-NEXT: v_mov_b32_e32 v43, s23
+; SDAG-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: s_nop 2
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
+; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -1569,52 +1550,44 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; GISEL-NEXT: v_mov_b32_e32 v24, 0
+; GISEL-NEXT: v_mov_b32_e32 v56, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9]
+; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15]
+; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
+; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
@@ -1623,62 +1596,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; HEURRC-NEXT: v_mov_b32_e32 v12, 0
+; HEURRC-NEXT: v_mov_b32_e32 v44, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23
-; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8
-; HEURRC-NEXT: v_mov_b32_e32 v8, s20
-; HEURRC-NEXT: v_mov_b32_e32 v9, s21
-; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31]
-; HEURRC-NEXT: v_mov_b32_e32 v10, s22
-; HEURRC-NEXT: v_mov_b32_e32 v11, s23
-; HEURRC-NEXT: v_mov_b32_e32 v0, s16
-; HEURRC-NEXT: v_mov_b32_e32 v1, s17
-; HEURRC-NEXT: v_mov_b32_e32 v2, s18
-; HEURRC-NEXT: v_mov_b32_e32 v3, s19
-; HEURRC-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; HEURRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; HEURRC-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; HEURRC-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; HEURRC-NEXT: v_mov_b32_e32 v40, s20
+; HEURRC-NEXT: v_mov_b32_e32 v41, s21
+; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31]
+; HEURRC-NEXT: v_mov_b32_e32 v42, s22
+; HEURRC-NEXT: v_mov_b32_e32 v43, s23
+; HEURRC-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: s_nop 2
+; HEURRC-NEXT: v_mov_b32_e32 v16, s16
+; HEURRC-NEXT: v_mov_b32_e32 v17, s17
+; HEURRC-NEXT: v_mov_b32_e32 v18, s18
+; HEURRC-NEXT: v_mov_b32_e32 v19, s19
+; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s12
-; HEURRC-NEXT: v_mov_b32_e32 v1, s13
-; HEURRC-NEXT: v_mov_b32_e32 v2, s14
-; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s12
+; HEURRC-NEXT: v_mov_b32_e32 v17, s13
+; HEURRC-NEXT: v_mov_b32_e32 v18, s14
+; HEURRC-NEXT: v_mov_b32_e32 v19, s15
+; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s8
-; HEURRC-NEXT: v_mov_b32_e32 v1, s9
-; HEURRC-NEXT: v_mov_b32_e32 v2, s10
-; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s8
+; HEURRC-NEXT: v_mov_b32_e32 v17, s9
+; HEURRC-NEXT: v_mov_b32_e32 v18, s10
+; HEURRC-NEXT: v_mov_b32_e32 v19, s11
+; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -1869,62 +1835,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; SDAG-NEXT: v_mov_b32_e32 v12, 0
+; SDAG-NEXT: v_mov_b32_e32 v44, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; SDAG-NEXT: v_accvgpr_write_b32 a31, s23
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; SDAG-NEXT: v_accvgpr_write_b32 a30, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a29, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a28, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a27, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a26, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a25, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a24, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a23, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a22, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a21, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a20, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a19, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a18, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a17, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a16, s8
-; SDAG-NEXT: v_mov_b32_e32 v8, s20
-; SDAG-NEXT: v_mov_b32_e32 v9, s21
-; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
-; SDAG-NEXT: v_mov_b32_e32 v10, s22
-; SDAG-NEXT: v_mov_b32_e32 v11, s23
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; SDAG-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; SDAG-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; SDAG-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; SDAG-NEXT: v_mov_b32_e32 v40, s20
+; SDAG-NEXT: v_mov_b32_e32 v41, s21
+; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
+; SDAG-NEXT: v_mov_b32_e32 v42, s22
+; SDAG-NEXT: v_mov_b32_e32 v43, s23
+; SDAG-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: s_nop 2
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
+; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -1933,52 +1892,44 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; GISEL-NEXT: v_mov_b32_e32 v24, 0
+; GISEL-NEXT: v_mov_b32_e32 v56, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:1 abid:2 blgp:3
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9]
+; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:1 abid:2 blgp:3
+; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
+; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
@@ -1987,62 +1938,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; HEURRC-NEXT: v_mov_b32_e32 v12, 0
+; HEURRC-NEXT: v_mov_b32_e32 v44, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23
-; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8
-; HEURRC-NEXT: v_mov_b32_e32 v8, s20
-; HEURRC-NEXT: v_mov_b32_e32 v9, s21
-; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
-; HEURRC-NEXT: v_mov_b32_e32 v10, s22
-; HEURRC-NEXT: v_mov_b32_e32 v11, s23
-; HEURRC-NEXT: v_mov_b32_e32 v0, s16
-; HEURRC-NEXT: v_mov_b32_e32 v1, s17
-; HEURRC-NEXT: v_mov_b32_e32 v2, s18
-; HEURRC-NEXT: v_mov_b32_e32 v3, s19
-; HEURRC-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; HEURRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; HEURRC-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; HEURRC-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; HEURRC-NEXT: v_mov_b32_e32 v40, s20
+; HEURRC-NEXT: v_mov_b32_e32 v41, s21
+; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
+; HEURRC-NEXT: v_mov_b32_e32 v42, s22
+; HEURRC-NEXT: v_mov_b32_e32 v43, s23
+; HEURRC-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: s_nop 2
+; HEURRC-NEXT: v_mov_b32_e32 v16, s16
+; HEURRC-NEXT: v_mov_b32_e32 v17, s17
+; HEURRC-NEXT: v_mov_b32_e32 v18, s18
+; HEURRC-NEXT: v_mov_b32_e32 v19, s19
+; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s12
-; HEURRC-NEXT: v_mov_b32_e32 v1, s13
-; HEURRC-NEXT: v_mov_b32_e32 v2, s14
-; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s12
+; HEURRC-NEXT: v_mov_b32_e32 v17, s13
+; HEURRC-NEXT: v_mov_b32_e32 v18, s14
+; HEURRC-NEXT: v_mov_b32_e32 v19, s15
+; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s8
-; HEURRC-NEXT: v_mov_b32_e32 v1, s9
-; HEURRC-NEXT: v_mov_b32_e32 v2, s10
-; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s8
+; HEURRC-NEXT: v_mov_b32_e32 v17, s9
+; HEURRC-NEXT: v_mov_b32_e32 v18, s10
+; HEURRC-NEXT: v_mov_b32_e32 v19, s11
+; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -2234,35 +2178,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15]
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
-; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac:
@@ -2271,35 +2207,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15]
+; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac:
@@ -2308,35 +2236,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
-; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
+; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
-; HEURRC-NEXT: v_mov_b32_e32 v0, 0
+; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15]
+; HEURRC-NEXT: v_mov_b32_e32 v16, 0
; HEURRC-NEXT: s_nop 7
; HEURRC-NEXT: s_nop 2
-; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac:
@@ -2443,35 +2363,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
-; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags:
@@ -2480,35 +2392,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags:
@@ -2517,35 +2421,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
-; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
+; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
-; HEURRC-NEXT: v_mov_b32_e32 v0, 0
+; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; HEURRC-NEXT: v_mov_b32_e32 v16, 0
; HEURRC-NEXT: s_nop 7
; HEURRC-NEXT: s_nop 2
-; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags:
@@ -2781,7 +2677,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: v_mov_b32_e32 v12, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
@@ -2791,14 +2687,14 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
; SDAG-NEXT: v_mov_b32_e32 v5, s13
; SDAG-NEXT: v_mov_b32_e32 v6, s14
; SDAG-NEXT: v_mov_b32_e32 v7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; SDAG-NEXT: v_mov_b32_e32 v8, s0
+; SDAG-NEXT: v_mov_b32_e32 v9, s1
+; SDAG-NEXT: v_mov_b32_e32 v10, s2
+; SDAG-NEXT: v_mov_b32_e32 v11, s3
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
+; SDAG-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11]
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd:
@@ -2810,16 +2706,14 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd:
@@ -2827,7 +2721,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v12, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b32_e32 v0, s8
; HEURRC-NEXT: v_mov_b32_e32 v1, s9
@@ -2837,14 +2731,14 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
; HEURRC-NEXT: v_mov_b32_e32 v5, s13
; HEURRC-NEXT: v_mov_b32_e32 v6, s14
; HEURRC-NEXT: v_mov_b32_e32 v7, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3
+; HEURRC-NEXT: v_mov_b32_e32 v8, s0
+; HEURRC-NEXT: v_mov_b32_e32 v9, s1
+; HEURRC-NEXT: v_mov_b32_e32 v10, s2
+; HEURRC-NEXT: v_mov_b32_e32 v11, s3
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
+; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11]
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd:
@@ -2930,7 +2824,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: v_mov_b32_e32 v12, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
@@ -2940,14 +2834,14 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
; SDAG-NEXT: v_mov_b32_e32 v5, s13
; SDAG-NEXT: v_mov_b32_e32 v6, s14
; SDAG-NEXT: v_mov_b32_e32 v7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; SDAG-NEXT: v_mov_b32_e32 v8, s0
+; SDAG-NEXT: v_mov_b32_e32 v9, s1
+; SDAG-NEXT: v_mov_b32_e32 v10, s2
+; SDAG-NEXT: v_mov_b32_e32 v11, s3
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
+; SDAG-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags:
@@ -2959,16 +2853,14 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags:
@@ -2976,7 +2868,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v12, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b32_e32 v0, s8
; HEURRC-NEXT: v_mov_b32_e32 v1, s9
@@ -2986,14 +2878,14 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
; HEURRC-NEXT: v_mov_b32_e32 v5, s13
; HEURRC-NEXT: v_mov_b32_e32 v6, s14
; HEURRC-NEXT: v_mov_b32_e32 v7, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3
+; HEURRC-NEXT: v_mov_b32_e32 v8, s0
+; HEURRC-NEXT: v_mov_b32_e32 v9, s1
+; HEURRC-NEXT: v_mov_b32_e32 v10, s2
+; HEURRC-NEXT: v_mov_b32_e32 v11, s3
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
+; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags:
@@ -3084,19 +2976,19 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], 48
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32
-; SDAG-NEXT: v_mov_b64_e32 v[12:13], 16
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s24
-; SDAG-NEXT: v_mov_b32_e32 v1, s25
-; SDAG-NEXT: v_mov_b32_e32 v2, s26
-; SDAG-NEXT: v_mov_b32_e32 v3, s27
+; SDAG-NEXT: v_mov_b32_e32 v8, s24
+; SDAG-NEXT: v_mov_b32_e32 v9, s25
+; SDAG-NEXT: v_mov_b32_e32 v10, s26
+; SDAG-NEXT: v_mov_b32_e32 v11, s27
; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_mov_b32_e32 v4, s28
-; SDAG-NEXT: v_mov_b32_e32 v5, s29
-; SDAG-NEXT: v_mov_b32_e32 v6, s30
-; SDAG-NEXT: v_mov_b32_e32 v7, s31
+; SDAG-NEXT: v_mov_b32_e32 v12, s28
+; SDAG-NEXT: v_mov_b32_e32 v13, s29
+; SDAG-NEXT: v_mov_b32_e32 v14, s30
+; SDAG-NEXT: v_mov_b32_e32 v15, s31
; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
@@ -3112,44 +3004,42 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
-; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15]
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[8:11], v[12:15], a[0:15]
+; SDAG-NEXT: v_mov_b32_e32 v8, s16
+; SDAG-NEXT: v_mov_b32_e32 v9, s17
+; SDAG-NEXT: v_mov_b32_e32 v10, s18
+; SDAG-NEXT: v_mov_b32_e32 v11, s19
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v2, s10
+; SDAG-NEXT: v_mov_b32_e32 v3, s11
+; SDAG-NEXT: v_mov_b32_e32 v8, s20
+; SDAG-NEXT: v_mov_b32_e32 v9, s21
+; SDAG-NEXT: v_mov_b32_e32 v10, s22
+; SDAG-NEXT: v_mov_b32_e32 v11, s23
+; SDAG-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s12
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -3214,19 +3104,19 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b64_e32 v[8:9], 48
-; HEURRC-NEXT: v_mov_b64_e32 v[10:11], 32
-; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 16
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], 48
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], 32
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], 16
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v0, s24
-; HEURRC-NEXT: v_mov_b32_e32 v1, s25
-; HEURRC-NEXT: v_mov_b32_e32 v2, s26
-; HEURRC-NEXT: v_mov_b32_e32 v3, s27
+; HEURRC-NEXT: v_mov_b32_e32 v8, s24
+; HEURRC-NEXT: v_mov_b32_e32 v9, s25
+; HEURRC-NEXT: v_mov_b32_e32 v10, s26
+; HEURRC-NEXT: v_mov_b32_e32 v11, s27
; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
-; HEURRC-NEXT: v_mov_b32_e32 v4, s28
-; HEURRC-NEXT: v_mov_b32_e32 v5, s29
-; HEURRC-NEXT: v_mov_b32_e32 v6, s30
-; HEURRC-NEXT: v_mov_b32_e32 v7, s31
+; HEURRC-NEXT: v_mov_b32_e32 v12, s28
+; HEURRC-NEXT: v_mov_b32_e32 v13, s29
+; HEURRC-NEXT: v_mov_b32_e32 v14, s30
+; HEURRC-NEXT: v_mov_b32_e32 v15, s31
; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
@@ -3242,44 +3132,42 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
-; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 0
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], 0
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15]
-; HEURRC-NEXT: v_mov_b32_e32 v0, s16
-; HEURRC-NEXT: v_mov_b32_e32 v1, s17
-; HEURRC-NEXT: v_mov_b32_e32 v2, s18
-; HEURRC-NEXT: v_mov_b32_e32 v3, s19
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[8:11], v[12:15], a[0:15]
+; HEURRC-NEXT: v_mov_b32_e32 v8, s16
+; HEURRC-NEXT: v_mov_b32_e32 v9, s17
+; HEURRC-NEXT: v_mov_b32_e32 v10, s18
+; HEURRC-NEXT: v_mov_b32_e32 v11, s19
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
-; HEURRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v2, s10
+; HEURRC-NEXT: v_mov_b32_e32 v3, s11
+; HEURRC-NEXT: v_mov_b32_e32 v8, s20
+; HEURRC-NEXT: v_mov_b32_e32 v9, s21
+; HEURRC-NEXT: v_mov_b32_e32 v10, s22
+; HEURRC-NEXT: v_mov_b32_e32 v11, s23
+; HEURRC-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s8
; HEURRC-NEXT: v_mov_b32_e32 v1, s9
-; HEURRC-NEXT: v_mov_b32_e32 v2, s10
-; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s12
; HEURRC-NEXT: v_mov_b32_e32 v1, s13
; HEURRC-NEXT: v_mov_b32_e32 v2, s14
; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -3287,19 +3175,19 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], 48
-; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], 32
-; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 16
+; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], 48
+; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], 32
+; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], 16
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b32_e32 v32, s24
-; VGPRRC-NEXT: v_mov_b32_e32 v33, s25
-; VGPRRC-NEXT: v_mov_b32_e32 v34, s26
-; VGPRRC-NEXT: v_mov_b32_e32 v35, s27
+; VGPRRC-NEXT: v_mov_b32_e32 v40, s24
+; VGPRRC-NEXT: v_mov_b32_e32 v41, s25
+; VGPRRC-NEXT: v_mov_b32_e32 v42, s26
+; VGPRRC-NEXT: v_mov_b32_e32 v43, s27
; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; VGPRRC-NEXT: v_mov_b32_e32 v36, s28
-; VGPRRC-NEXT: v_mov_b32_e32 v37, s29
-; VGPRRC-NEXT: v_mov_b32_e32 v38, s30
-; VGPRRC-NEXT: v_mov_b32_e32 v39, s31
+; VGPRRC-NEXT: v_mov_b32_e32 v44, s28
+; VGPRRC-NEXT: v_mov_b32_e32 v45, s29
+; VGPRRC-NEXT: v_mov_b32_e32 v46, s30
+; VGPRRC-NEXT: v_mov_b32_e32 v47, s31
; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
@@ -3307,45 +3195,45 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0
+; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], 0
; VGPRRC-NEXT: s_nop 0
-; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15]
+; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[40:43], v[44:47], v[0:15]
; VGPRRC-NEXT: s_nop 7
; VGPRRC-NEXT: s_nop 3
-; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[28:31], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[24:27], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[20:23], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[16:19], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: v_mov_b32_e32 v0, s16
; VGPRRC-NEXT: v_mov_b32_e32 v1, s17
; VGPRRC-NEXT: v_mov_b32_e32 v2, s18
; VGPRRC-NEXT: v_mov_b32_e32 v3, s19
-; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s20
; VGPRRC-NEXT: v_mov_b32_e32 v1, s21
; VGPRRC-NEXT: v_mov_b32_e32 v2, s22
; VGPRRC-NEXT: v_mov_b32_e32 v3, s23
-; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s12
; VGPRRC-NEXT: v_mov_b32_e32 v1, s13
; VGPRRC-NEXT: v_mov_b32_e32 v2, s14
; VGPRRC-NEXT: v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_i32_32x32x32_i8:
@@ -3496,19 +3384,19 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], 48
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32
-; SDAG-NEXT: v_mov_b64_e32 v[12:13], 16
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s24
-; SDAG-NEXT: v_mov_b32_e32 v1, s25
-; SDAG-NEXT: v_mov_b32_e32 v2, s26
-; SDAG-NEXT: v_mov_b32_e32 v3, s27
+; SDAG-NEXT: v_mov_b32_e32 v8, s24
+; SDAG-NEXT: v_mov_b32_e32 v9, s25
+; SDAG-NEXT: v_mov_b32_e32 v10, s26
+; SDAG-NEXT: v_mov_b32_e32 v11, s27
; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_mov_b32_e32 v4, s28
-; SDAG-NEXT: v_mov_b32_e32 v5, s29
-; SDAG-NEXT: v_mov_b32_e32 v6, s30
-; SDAG-NEXT: v_mov_b32_e32 v7, s31
+; SDAG-NEXT: v_mov_b32_e32 v12, s28
+; SDAG-NEXT: v_mov_b32_e32 v13, s29
+; SDAG-NEXT: v_mov_b32_e32 v14, s30
+; SDAG-NEXT: v_mov_b32_e32 v15, s31
; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
@@ -3524,44 +3412,42 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
-; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[8:11], v[12:15], a[0:15] cbsz:2 abid:3 blgp:1
+; SDAG-NEXT: v_mov_b32_e32 v8, s16
+; SDAG-NEXT: v_mov_b32_e32 v9, s17
+; SDAG-NEXT: v_mov_b32_e32 v10, s18
+; SDAG-NEXT: v_mov_b32_e32 v11, s19
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v2, s10
+; SDAG-NEXT: v_mov_b32_e32 v3, s11
+; SDAG-NEXT: v_mov_b32_e32 v8, s20
+; SDAG-NEXT: v_mov_b32_e32 v9, s21
+; SDAG-NEXT: v_mov_b32_e32 v10, s22
+; SDAG-NEXT: v_mov_b32_e32 v11, s23
+; SDAG-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s12
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -3626,19 +3512,19 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b64_e32 v[8:9], 48
-; HEURRC-NEXT: v_mov_b64_e32 v[10:11], 32
-; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 16
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], 48
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], 32
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], 16
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v0, s24
-; HEURRC-NEXT: v_mov_b32_e32 v1, s25
-; HEURRC-NEXT: v_mov_b32_e32 v2, s26
-; HEURRC-NEXT: v_mov_b32_e32 v3, s27
+; HEURRC-NEXT: v_mov_b32_e32 v8, s24
+; HEURRC-NEXT: v_mov_b32_e32 v9, s25
+; HEURRC-NEXT: v_mov_b32_e32 v10, s26
+; HEURRC-NEXT: v_mov_b32_e32 v11, s27
; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
-; HEURRC-NEXT: v_mov_b32_e32 v4, s28
-; HEURRC-NEXT: v_mov_b32_e32 v5, s29
-; HEURRC-NEXT: v_mov_b32_e32 v6, s30
-; HEURRC-NEXT: v_mov_b32_e32 v7, s31
+; HEURRC-NEXT: v_mov_b32_e32 v12, s28
+; HEURRC-NEXT: v_mov_b32_e32 v13, s29
+; HEURRC-NEXT: v_mov_b32_e32 v14, s30
+; HEURRC-NEXT: v_mov_b32_e32 v15, s31
; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
@@ -3654,44 +3540,42 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
-; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 0
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], 0
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
-; HEURRC-NEXT: v_mov_b32_e32 v0, s16
-; HEURRC-NEXT: v_mov_b32_e32 v1, s17
-; HEURRC-NEXT: v_mov_b32_e32 v2, s18
-; HEURRC-NEXT: v_mov_b32_e32 v3, s19
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[8:11], v[12:15], a[0:15] cbsz:2 abid:3 blgp:1
+; HEURRC-NEXT: v_mov_b32_e32 v8, s16
+; HEURRC-NEXT: v_mov_b32_e32 v9, s17
+; HEURRC-NEXT: v_mov_b32_e32 v10, s18
+; HEURRC-NEXT: v_mov_b32_e32 v11, s19
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
-; HEURRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v2, s10
+; HEURRC-NEXT: v_mov_b32_e32 v3, s11
+; HEURRC-NEXT: v_mov_b32_e32 v8, s20
+; HEURRC-NEXT: v_mov_b32_e32 v9, s21
+; HEURRC-NEXT: v_mov_b32_e32 v10, s22
+; HEURRC-NEXT: v_mov_b32_e32 v11, s23
+; HEURRC-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s8
; HEURRC-NEXT: v_mov_b32_e32 v1, s9
-; HEURRC-NEXT: v_mov_b32_e32 v2, s10
-; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s12
; HEURRC-NEXT: v_mov_b32_e32 v1, s13
; HEURRC-NEXT: v_mov_b32_e32 v2, s14
; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -3699,19 +3583,19 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], 48
-; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], 32
-; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 16
+; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], 48
+; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], 32
+; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], 16
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b32_e32 v32, s24
-; VGPRRC-NEXT: v_mov_b32_e32 v33, s25
-; VGPRRC-NEXT: v_mov_b32_e32 v34, s26
-; VGPRRC-NEXT: v_mov_b32_e32 v35, s27
+; VGPRRC-NEXT: v_mov_b32_e32 v40, s24
+; VGPRRC-NEXT: v_mov_b32_e32 v41, s25
+; VGPRRC-NEXT: v_mov_b32_e32 v42, s26
+; VGPRRC-NEXT: v_mov_b32_e32 v43, s27
; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; VGPRRC-NEXT: v_mov_b32_e32 v36, s28
-; VGPRRC-NEXT: v_mov_b32_e32 v37, s29
-; VGPRRC-NEXT: v_mov_b32_e32 v38, s30
-; VGPRRC-NEXT: v_mov_b32_e32 v39, s31
+; VGPRRC-NEXT: v_mov_b32_e32 v44, s28
+; VGPRRC-NEXT: v_mov_b32_e32 v45, s29
+; VGPRRC-NEXT: v_mov_b32_e32 v46, s30
+; VGPRRC-NEXT: v_mov_b32_e32 v47, s31
; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
@@ -3719,45 +3603,45 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0
+; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], 0
; VGPRRC-NEXT: s_nop 0
-; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:2 abid:3 blgp:1
+; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[40:43], v[44:47], v[0:15] cbsz:2 abid:3 blgp:1
; VGPRRC-NEXT: s_nop 7
; VGPRRC-NEXT: s_nop 3
-; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[28:31], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[24:27], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[20:23], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[16:19], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: v_mov_b32_e32 v0, s16
; VGPRRC-NEXT: v_mov_b32_e32 v1, s17
; VGPRRC-NEXT: v_mov_b32_e32 v2, s18
; VGPRRC-NEXT: v_mov_b32_e32 v3, s19
-; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s20
; VGPRRC-NEXT: v_mov_b32_e32 v1, s21
; VGPRRC-NEXT: v_mov_b32_e32 v2, s22
; VGPRRC-NEXT: v_mov_b32_e32 v3, s23
-; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s12
; VGPRRC-NEXT: v_mov_b32_e32 v1, s13
; VGPRRC-NEXT: v_mov_b32_e32 v2, s14
; VGPRRC-NEXT: v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_i32_32x32x32_i8__flags:
@@ -4254,70 +4138,63 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: v_mov_b32_e32 v40, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
+; SDAG-NEXT: v_mov_b32_e32 v32, s20
+; SDAG-NEXT: v_mov_b32_e32 v33, s21
+; SDAG-NEXT: v_mov_b32_e32 v34, s22
+; SDAG-NEXT: v_mov_b32_e32 v35, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b32_e32 v4, s24
-; SDAG-NEXT: v_mov_b32_e32 v5, s25
-; SDAG-NEXT: v_mov_b32_e32 v6, s26
-; SDAG-NEXT: v_mov_b32_e32 v7, s27
+; SDAG-NEXT: v_mov_b32_e32 v36, s24
+; SDAG-NEXT: v_mov_b32_e32 v37, s25
+; SDAG-NEXT: v_mov_b32_e32 v38, s26
+; SDAG-NEXT: v_mov_b32_e32 v39, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a31, s23
-; SDAG-NEXT: v_accvgpr_write_b32 a30, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a29, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a28, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a27, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a26, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a25, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a24, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a23, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a22, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a21, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a20, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a19, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a18, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a17, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a16, s8
+; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31]
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
+; SDAG-NEXT: s_nop 6
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -4326,52 +4203,44 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; GISEL-NEXT: v_mov_b32_e32 v24, 0
+; GISEL-NEXT: v_mov_b32_e32 v56, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9]
+; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15]
+; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
+; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
@@ -4379,70 +4248,63 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v40, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
+; HEURRC-NEXT: v_mov_b32_e32 v32, s20
+; HEURRC-NEXT: v_mov_b32_e32 v33, s21
+; HEURRC-NEXT: v_mov_b32_e32 v34, s22
+; HEURRC-NEXT: v_mov_b32_e32 v35, s23
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b32_e32 v4, s24
-; HEURRC-NEXT: v_mov_b32_e32 v5, s25
-; HEURRC-NEXT: v_mov_b32_e32 v6, s26
-; HEURRC-NEXT: v_mov_b32_e32 v7, s27
+; HEURRC-NEXT: v_mov_b32_e32 v36, s24
+; HEURRC-NEXT: v_mov_b32_e32 v37, s25
+; HEURRC-NEXT: v_mov_b32_e32 v38, s26
+; HEURRC-NEXT: v_mov_b32_e32 v39, s27
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23
-; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8
+; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31]
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
+; HEURRC-NEXT: s_nop 6
+; HEURRC-NEXT: v_mov_b32_e32 v16, s20
+; HEURRC-NEXT: v_mov_b32_e32 v17, s21
+; HEURRC-NEXT: v_mov_b32_e32 v18, s22
+; HEURRC-NEXT: v_mov_b32_e32 v19, s23
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s16
-; HEURRC-NEXT: v_mov_b32_e32 v1, s17
-; HEURRC-NEXT: v_mov_b32_e32 v2, s18
-; HEURRC-NEXT: v_mov_b32_e32 v3, s19
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s16
+; HEURRC-NEXT: v_mov_b32_e32 v17, s17
+; HEURRC-NEXT: v_mov_b32_e32 v18, s18
+; HEURRC-NEXT: v_mov_b32_e32 v19, s19
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s12
-; HEURRC-NEXT: v_mov_b32_e32 v1, s13
-; HEURRC-NEXT: v_mov_b32_e32 v2, s14
-; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s12
+; HEURRC-NEXT: v_mov_b32_e32 v17, s13
+; HEURRC-NEXT: v_mov_b32_e32 v18, s14
+; HEURRC-NEXT: v_mov_b32_e32 v19, s15
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s8
-; HEURRC-NEXT: v_mov_b32_e32 v1, s9
-; HEURRC-NEXT: v_mov_b32_e32 v2, s10
-; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s8
+; HEURRC-NEXT: v_mov_b32_e32 v17, s9
+; HEURRC-NEXT: v_mov_b32_e32 v18, s10
+; HEURRC-NEXT: v_mov_b32_e32 v19, s11
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -4653,70 +4515,63 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: v_mov_b32_e32 v40, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
+; SDAG-NEXT: v_mov_b32_e32 v32, s20
+; SDAG-NEXT: v_mov_b32_e32 v33, s21
+; SDAG-NEXT: v_mov_b32_e32 v34, s22
+; SDAG-NEXT: v_mov_b32_e32 v35, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b32_e32 v4, s24
-; SDAG-NEXT: v_mov_b32_e32 v5, s25
-; SDAG-NEXT: v_mov_b32_e32 v6, s26
-; SDAG-NEXT: v_mov_b32_e32 v7, s27
+; SDAG-NEXT: v_mov_b32_e32 v36, s24
+; SDAG-NEXT: v_mov_b32_e32 v37, s25
+; SDAG-NEXT: v_mov_b32_e32 v38, s26
+; SDAG-NEXT: v_mov_b32_e32 v39, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a31, s23
-; SDAG-NEXT: v_accvgpr_write_b32 a30, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a29, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a28, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a27, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a26, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a25, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a24, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a23, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a22, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a21, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a20, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a19, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a18, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a17, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a16, s8
+; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
+; SDAG-NEXT: s_nop 6
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -4725,52 +4580,44 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; GISEL-NEXT: v_mov_b32_e32 v24, 0
+; GISEL-NEXT: v_mov_b32_e32 v56, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:1 abid:2 blgp:3
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9]
+; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:1 abid:2 blgp:3
+; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
+; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
@@ -4778,70 +4625,63 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v40, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
+; HEURRC-NEXT: v_mov_b32_e32 v32, s20
+; HEURRC-NEXT: v_mov_b32_e32 v33, s21
+; HEURRC-NEXT: v_mov_b32_e32 v34, s22
+; HEURRC-NEXT: v_mov_b32_e32 v35, s23
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b32_e32 v4, s24
-; HEURRC-NEXT: v_mov_b32_e32 v5, s25
-; HEURRC-NEXT: v_mov_b32_e32 v6, s26
-; HEURRC-NEXT: v_mov_b32_e32 v7, s27
+; HEURRC-NEXT: v_mov_b32_e32 v36, s24
+; HEURRC-NEXT: v_mov_b32_e32 v37, s25
+; HEURRC-NEXT: v_mov_b32_e32 v38, s26
+; HEURRC-NEXT: v_mov_b32_e32 v39, s27
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23
-; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8
+; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
+; HEURRC-NEXT: s_nop 6
+; HEURRC-NEXT: v_mov_b32_e32 v16, s20
+; HEURRC-NEXT: v_mov_b32_e32 v17, s21
+; HEURRC-NEXT: v_mov_b32_e32 v18, s22
+; HEURRC-NEXT: v_mov_b32_e32 v19, s23
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s16
-; HEURRC-NEXT: v_mov_b32_e32 v1, s17
-; HEURRC-NEXT: v_mov_b32_e32 v2, s18
-; HEURRC-NEXT: v_mov_b32_e32 v3, s19
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s16
+; HEURRC-NEXT: v_mov_b32_e32 v17, s17
+; HEURRC-NEXT: v_mov_b32_e32 v18, s18
+; HEURRC-NEXT: v_mov_b32_e32 v19, s19
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s12
-; HEURRC-NEXT: v_mov_b32_e32 v1, s13
-; HEURRC-NEXT: v_mov_b32_e32 v2, s14
-; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s12
+; HEURRC-NEXT: v_mov_b32_e32 v17, s13
+; HEURRC-NEXT: v_mov_b32_e32 v18, s14
+; HEURRC-NEXT: v_mov_b32_e32 v19, s15
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s8
-; HEURRC-NEXT: v_mov_b32_e32 v1, s9
-; HEURRC-NEXT: v_mov_b32_e32 v2, s10
-; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s8
+; HEURRC-NEXT: v_mov_b32_e32 v17, s9
+; HEURRC-NEXT: v_mov_b32_e32 v18, s10
+; HEURRC-NEXT: v_mov_b32_e32 v19, s11
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -5053,41 +4893,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b32_e32 v4, s24
-; SDAG-NEXT: v_mov_b32_e32 v5, s25
-; SDAG-NEXT: v_mov_b32_e32 v6, s26
-; SDAG-NEXT: v_mov_b32_e32 v7, s27
+; SDAG-NEXT: v_mov_b32_e32 v20, s24
+; SDAG-NEXT: v_mov_b32_e32 v21, s25
+; SDAG-NEXT: v_mov_b32_e32 v22, s26
+; SDAG-NEXT: v_mov_b32_e32 v23, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15]
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
-; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac:
@@ -5096,35 +4928,27 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15]
+; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac:
@@ -5132,41 +4956,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
+; HEURRC-NEXT: v_mov_b32_e32 v16, s20
+; HEURRC-NEXT: v_mov_b32_e32 v17, s21
+; HEURRC-NEXT: v_mov_b32_e32 v18, s22
+; HEURRC-NEXT: v_mov_b32_e32 v19, s23
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b32_e32 v4, s24
-; HEURRC-NEXT: v_mov_b32_e32 v5, s25
-; HEURRC-NEXT: v_mov_b32_e32 v6, s26
-; HEURRC-NEXT: v_mov_b32_e32 v7, s27
+; HEURRC-NEXT: v_mov_b32_e32 v20, s24
+; HEURRC-NEXT: v_mov_b32_e32 v21, s25
+; HEURRC-NEXT: v_mov_b32_e32 v22, s26
+; HEURRC-NEXT: v_mov_b32_e32 v23, s27
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
-; HEURRC-NEXT: v_mov_b32_e32 v0, 0
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15]
+; HEURRC-NEXT: v_mov_b32_e32 v16, 0
; HEURRC-NEXT: s_nop 7
; HEURRC-NEXT: s_nop 2
-; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac:
@@ -5287,41 +5103,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b32_e32 v4, s24
-; SDAG-NEXT: v_mov_b32_e32 v5, s25
-; SDAG-NEXT: v_mov_b32_e32 v6, s26
-; SDAG-NEXT: v_mov_b32_e32 v7, s27
+; SDAG-NEXT: v_mov_b32_e32 v20, s24
+; SDAG-NEXT: v_mov_b32_e32 v21, s25
+; SDAG-NEXT: v_mov_b32_e32 v22, s26
+; SDAG-NEXT: v_mov_b32_e32 v23, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
-; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags:
@@ -5330,35 +5138,27 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags:
@@ -5366,41 +5166,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
+; HEURRC-NEXT: v_mov_b32_e32 v16, s20
+; HEURRC-NEXT: v_mov_b32_e32 v17, s21
+; HEURRC-NEXT: v_mov_b32_e32 v18, s22
+; HEURRC-NEXT: v_mov_b32_e32 v19, s23
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b32_e32 v4, s24
-; HEURRC-NEXT: v_mov_b32_e32 v5, s25
-; HEURRC-NEXT: v_mov_b32_e32 v6, s26
-; HEURRC-NEXT: v_mov_b32_e32 v7, s27
+; HEURRC-NEXT: v_mov_b32_e32 v20, s24
+; HEURRC-NEXT: v_mov_b32_e32 v21, s25
+; HEURRC-NEXT: v_mov_b32_e32 v22, s26
+; HEURRC-NEXT: v_mov_b32_e32 v23, s27
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
-; HEURRC-NEXT: v_mov_b32_e32 v0, 0
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; HEURRC-NEXT: v_mov_b32_e32 v16, 0
; HEURRC-NEXT: s_nop 7
; HEURRC-NEXT: s_nop 2
-; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags:
@@ -5651,20 +5443,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GCN-NEXT: v_mov_b32_e32 v8, 0
+; GCN-NEXT: v_mov_b32_e32 v12, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GCN-NEXT: v_accvgpr_write_b32 a0, s0
+; GCN-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GCN-NEXT: v_accvgpr_write_b32 a1, s1
-; GCN-NEXT: v_accvgpr_write_b32 a2, s2
-; GCN-NEXT: v_accvgpr_write_b32 a3, s3
+; GCN-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
+; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
; GCN-NEXT: s_nop 7
-; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; GCN-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
@@ -5672,20 +5462,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v12, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
+; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
@@ -5755,20 +5543,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GCN-NEXT: v_mov_b32_e32 v8, 0
+; GCN-NEXT: v_mov_b32_e32 v12, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GCN-NEXT: v_accvgpr_write_b32 a0, s0
+; GCN-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GCN-NEXT: v_accvgpr_write_b32 a1, s1
-; GCN-NEXT: v_accvgpr_write_b32 a2, s2
-; GCN-NEXT: v_accvgpr_write_b32 a3, s3
+; GCN-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
+; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
; GCN-NEXT: s_nop 7
-; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; GCN-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
@@ -5776,20 +5562,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v12, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
+; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
@@ -5853,5 +5637,5 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt
ret void
}
-attributes #0 = { "amdgpu-flat-work-group-size"="512,512" }
+attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" }
attributes #1 = { "amdgpu-flat-work-group-size"="1,64" }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
index ff305da..78be949 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
@@ -3,6 +3,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug < %s | FileCheck -enable-var-scope --check-prefixes=GCN,LIT-SRCC,GFX908,GFX908_A %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A,GFX908_A,GFX90A_42 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942,GFX90A_42 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -amdgpu-mfma-vgpr-form < %s | FileCheck -enable-var-scope --check-prefix=GFX942-VGPR %s
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32)
@@ -405,6 +406,63 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_32x32x1f32:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v33, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v34, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, s16
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, s17
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, s18
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, s19
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, s20
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, s21
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s22
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s23
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s25
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s26
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s27
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, s28
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, s29
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v14, s30
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v15, s31
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, s0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, s1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, s2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, s3
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v20, s4
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v21, s5
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v22, s6
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v23, s7
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v24, s8
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v25, s9
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v26, s10
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v27, s11
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v28, s12
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v29, s13
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v30, s14
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, s15
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35]
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
@@ -618,6 +676,33 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_16x16x1f32:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 2.0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %in.1, i32 1, i32 2, i32 3)
@@ -719,6 +804,23 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: s_nop 3
; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 3
+; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %in.1, i32 1, i32 2, i32 3)
@@ -934,6 +1036,34 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_32x32x2f32:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 2.0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x2_f32 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float 1.0, float 2.0, <16 x float> %in.1, i32 1, i32 2, i32 3)
@@ -1039,6 +1169,24 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_16x16x4f32:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_16x16x4_f32 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float 1.0, float 2.0, <4 x float> %in.1, i32 1, i32 2, i32 3)
@@ -1114,19 +1262,19 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v2
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v1
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s14
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s15
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s14
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s15
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s0
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s1
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s0
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s1
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v5
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v6
-; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v2
-; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v3
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s2
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s3
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v0
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v1
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s2
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s3
; NOLIT-SRCC-NEXT: s_nop 1
-; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
+; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 1
@@ -1254,19 +1402,19 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v2
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v0
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v1
-; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s14
-; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s15
+; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s14
+; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s15
; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
-; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s0
-; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s1
+; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s0
+; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s1
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v5
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v6
-; LIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v2
-; LIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v3
-; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s2
-; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s3
+; LIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v0
+; LIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v1
+; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s2
+; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s3
; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
+; LIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 1
@@ -1330,7 +1478,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; GFX90A-LABEL: test_mfma_f32_32x32x4f16:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[36:37], 0x40
; GFX90A-NEXT: s_load_dwordx16 s[16:31], s[36:37], 0x0
@@ -1345,8 +1493,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; GFX90A-NEXT: v_accvgpr_write_b32 a2, s18
; GFX90A-NEXT: v_accvgpr_write_b32 a3, s19
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s0
-; GFX90A-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-NEXT: v_mov_b32_e32 v2, s0
+; GFX90A-NEXT: v_mov_b32_e32 v3, s1
; GFX90A-NEXT: v_accvgpr_write_b32 a4, s20
; GFX90A-NEXT: v_accvgpr_write_b32 a5, s21
; GFX90A-NEXT: v_accvgpr_write_b32 a6, s22
@@ -1371,27 +1519,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; GFX90A-NEXT: v_accvgpr_write_b32 a29, s13
; GFX90A-NEXT: v_accvgpr_write_b32 a30, s14
; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15
-; GFX90A-NEXT: v_mov_b32_e32 v2, s2
-; GFX90A-NEXT: v_mov_b32_e32 v3, s3
+; GFX90A-NEXT: v_mov_b32_e32 v4, s2
+; GFX90A-NEXT: v_mov_b32_e32 v5, s3
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[4:5], a[0:31] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 2
-; GFX90A-NEXT: global_store_dwordx4 v4, a[24:27], s[36:37] offset:96
-; GFX90A-NEXT: global_store_dwordx4 v4, a[28:31], s[36:37] offset:112
-; GFX90A-NEXT: global_store_dwordx4 v4, a[16:19], s[36:37] offset:64
-; GFX90A-NEXT: global_store_dwordx4 v4, a[20:23], s[36:37] offset:80
-; GFX90A-NEXT: global_store_dwordx4 v4, a[8:11], s[36:37] offset:32
-; GFX90A-NEXT: global_store_dwordx4 v4, a[12:15], s[36:37] offset:48
-; GFX90A-NEXT: global_store_dwordx4 v4, a[0:3], s[36:37]
-; GFX90A-NEXT: global_store_dwordx4 v4, a[4:7], s[36:37] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[36:37] offset:96
+; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[36:37] offset:112
+; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[36:37] offset:64
+; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[36:37] offset:80
+; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[36:37] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[36:37] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[36:37]
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[36:37] offset:16
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_32x32x4f16:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx16 s[0:15], s[36:37], 0x40
; GFX942-NEXT: s_load_dwordx16 s[16:31], s[36:37], 0x0
@@ -1406,8 +1554,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; GFX942-NEXT: v_accvgpr_write_b32 a2, s18
; GFX942-NEXT: v_accvgpr_write_b32 a3, s19
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, s0
-; GFX942-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NEXT: v_mov_b32_e32 v2, s0
+; GFX942-NEXT: v_mov_b32_e32 v3, s1
; GFX942-NEXT: v_accvgpr_write_b32 a4, s20
; GFX942-NEXT: v_accvgpr_write_b32 a5, s21
; GFX942-NEXT: v_accvgpr_write_b32 a6, s22
@@ -1432,22 +1580,83 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; GFX942-NEXT: v_accvgpr_write_b32 a29, s13
; GFX942-NEXT: v_accvgpr_write_b32 a30, s14
; GFX942-NEXT: v_accvgpr_write_b32 a31, s15
-; GFX942-NEXT: v_mov_b32_e32 v2, s2
-; GFX942-NEXT: v_mov_b32_e32 v3, s3
+; GFX942-NEXT: v_mov_b32_e32 v4, s2
+; GFX942-NEXT: v_mov_b32_e32 v5, s3
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[2:3], v[4:5], a[0:31] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 2
-; GFX942-NEXT: global_store_dwordx4 v4, a[24:27], s[36:37] offset:96
-; GFX942-NEXT: global_store_dwordx4 v4, a[28:31], s[36:37] offset:112
-; GFX942-NEXT: global_store_dwordx4 v4, a[16:19], s[36:37] offset:64
-; GFX942-NEXT: global_store_dwordx4 v4, a[20:23], s[36:37] offset:80
-; GFX942-NEXT: global_store_dwordx4 v4, a[8:11], s[36:37] offset:32
-; GFX942-NEXT: global_store_dwordx4 v4, a[12:15], s[36:37] offset:48
-; GFX942-NEXT: global_store_dwordx4 v4, a[0:3], s[36:37]
-; GFX942-NEXT: global_store_dwordx4 v4, a[4:7], s[36:37] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[36:37] offset:96
+; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[36:37] offset:112
+; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[36:37] offset:64
+; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[36:37] offset:80
+; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[36:37] offset:32
+; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[36:37] offset:48
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[36:37]
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[36:37] offset:16
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_32x32x4f16:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[36:37], 0x40
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[16:31], s[36:37], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, s0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, s1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, s2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, s3
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[38:39], 0x0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, s16
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, s17
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, s18
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, s19
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v34, s0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v35, s1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, s20
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, s21
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s22
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s23
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s25
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s26
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s27
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, s28
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, s29
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v14, s30
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v15, s31
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v20, s4
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v21, s5
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v22, s6
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v23, s7
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v24, s8
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v25, s9
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v26, s10
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v27, s11
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v28, s12
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v29, s13
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v30, s14
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, s15
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v36, s2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v37, s3
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x4_2b_f16 v[0:31], v[34:35], v[36:37], v[0:31] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 2
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[36:37] offset:96
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[36:37] offset:112
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[36:37] offset:64
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[20:23], s[36:37] offset:80
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[8:11], s[36:37] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[12:15], s[36:37] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[0:3], s[36:37]
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[4:7], s[36:37] offset:16
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%c.1 = load <4 x half>, ptr addrspace(1) %c
@@ -1676,6 +1885,36 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_16x16x4f16:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[20:23], s[18:19], 0x0
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, s20
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, s21
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, s22
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, s23
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_16x16x4_4b_f16 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%c.1 = load <4 x half>, ptr addrspace(1) %c
@@ -1752,46 +1991,66 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4f16(ptr addrspace(1) %arg, ptr add
; GFX90A-LABEL: test_mfma_f32_4x4x4f16:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s4
-; GFX90A-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NEXT: v_mov_b32_e32 v3, s5
; GFX90A-NEXT: v_accvgpr_write_b32 a0, s8
-; GFX90A-NEXT: v_mov_b32_e32 v2, s6
-; GFX90A-NEXT: v_mov_b32_e32 v3, s7
+; GFX90A-NEXT: v_mov_b32_e32 v4, s6
+; GFX90A-NEXT: v_mov_b32_e32 v5, s7
; GFX90A-NEXT: v_accvgpr_write_b32 a1, s9
; GFX90A-NEXT: v_accvgpr_write_b32 a2, s10
; GFX90A-NEXT: v_accvgpr_write_b32 a3, s11
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mfma_f32_4x4x4f16 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 4
-; GFX90A-NEXT: global_store_dwordx4 v4, a[0:3], s[0:1]
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_4x4x4f16:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX942-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, s4
-; GFX942-NEXT: v_mov_b32_e32 v1, s5
+; GFX942-NEXT: v_mov_b32_e32 v2, s4
+; GFX942-NEXT: v_mov_b32_e32 v3, s5
; GFX942-NEXT: v_accvgpr_write_b32 a0, s8
-; GFX942-NEXT: v_mov_b32_e32 v2, s6
-; GFX942-NEXT: v_mov_b32_e32 v3, s7
+; GFX942-NEXT: v_mov_b32_e32 v4, s6
+; GFX942-NEXT: v_mov_b32_e32 v5, s7
; GFX942-NEXT: v_accvgpr_write_b32 a1, s9
; GFX942-NEXT: v_accvgpr_write_b32 a2, s10
; GFX942-NEXT: v_accvgpr_write_b32 a3, s11
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_4x4x4_16b_f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_f32_4x4x4_16b_f16 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 4
-; GFX942-NEXT: global_store_dwordx4 v4, a[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_4x4x4f16:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s4
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s5
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s6
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s7
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_4x4x4_16b_f16 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 4
+; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%c.1 = load <4 x half>, ptr addrspace(1) %c
@@ -2021,6 +2280,36 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_32x32x8f16:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[20:23], s[18:19], 0x0
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, s20
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, s21
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, s22
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, s23
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%c.1 = load <4 x half>, ptr addrspace(1) %c
@@ -2099,47 +2388,67 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr
; GFX90A-LABEL: test_mfma_f32_16x16x16f16:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s4
-; GFX90A-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NEXT: v_mov_b32_e32 v3, s5
; GFX90A-NEXT: v_accvgpr_write_b32 a0, s8
-; GFX90A-NEXT: v_mov_b32_e32 v2, s6
-; GFX90A-NEXT: v_mov_b32_e32 v3, s7
+; GFX90A-NEXT: v_mov_b32_e32 v4, s6
+; GFX90A-NEXT: v_mov_b32_e32 v5, s7
; GFX90A-NEXT: v_accvgpr_write_b32 a1, s9
; GFX90A-NEXT: v_accvgpr_write_b32 a2, s10
; GFX90A-NEXT: v_accvgpr_write_b32 a3, s11
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mfma_f32_16x16x16f16 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 2
-; GFX90A-NEXT: global_store_dwordx4 v4, a[0:3], s[0:1]
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_16x16x16f16:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX942-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, s4
-; GFX942-NEXT: v_mov_b32_e32 v1, s5
+; GFX942-NEXT: v_mov_b32_e32 v2, s4
+; GFX942-NEXT: v_mov_b32_e32 v3, s5
; GFX942-NEXT: v_accvgpr_write_b32 a0, s8
-; GFX942-NEXT: v_mov_b32_e32 v2, s6
-; GFX942-NEXT: v_mov_b32_e32 v3, s7
+; GFX942-NEXT: v_mov_b32_e32 v4, s6
+; GFX942-NEXT: v_mov_b32_e32 v5, s7
; GFX942-NEXT: v_accvgpr_write_b32 a1, s9
; GFX942-NEXT: v_accvgpr_write_b32 a2, s10
; GFX942-NEXT: v_accvgpr_write_b32 a3, s11
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 6
-; GFX942-NEXT: global_store_dwordx4 v4, a[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_16x16x16f16:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s4
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s5
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s6
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s7
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 6
+; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%c.1 = load <4 x half>, ptr addrspace(1) %c
@@ -2508,6 +2817,63 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_i32_32x32x4i8:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v33, 1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v34, 2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, s16
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, s17
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, s18
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, s19
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, s20
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, s21
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s22
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s23
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s25
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s26
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s27
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, s28
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, s29
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v14, s30
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v15, s31
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, s0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, s1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, s2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, s3
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v20, s4
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v21, s5
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v22, s6
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v23, s7
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v24, s8
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v25, s9
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v26, s10
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v27, s11
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v28, s12
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v29, s13
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v30, s14
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, s15
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_i32_32x32x4_2b_i8 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 2
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35]
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <32 x i32>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32 1, i32 2, <32 x i32> %in.1, i32 1, i32 2, i32 3)
@@ -2721,6 +3087,33 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_i32_16x16x4i8:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 2
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <16 x i32>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32 1, i32 2, <16 x i32> %in.1, i32 1, i32 2, i32 3)
@@ -2728,6 +3121,143 @@ bb:
ret void
}
+define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspace(1) %arg) #0 {
+; NOLIT-SRCC-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
+; NOLIT-SRCC: ; %bb.0: ; %bb
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, 64
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
+; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v16, 0
+; NOLIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
+; NOLIT-SRCC-NEXT: s_nop 7
+; NOLIT-SRCC-NEXT: s_nop 1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a12
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a7
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a4
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a11
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a8
+; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; NOLIT-SRCC-NEXT: s_endpgm
+;
+; LIT-SRCC-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
+; LIT-SRCC: ; %bb.0: ; %bb
+; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1
+; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
+; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0
+; LIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
+; LIT-SRCC-NEXT: s_nop 7
+; LIT-SRCC-NEXT: s_nop 1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a12
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a7
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a5
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a4
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a11
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a8
+; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
+; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; LIT-SRCC-NEXT: s_endpgm
+;
+; GFX90A-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: v_mov_b32_e32 v0, 1
+; GFX90A-NEXT: v_mov_b32_e32 v1, 2
+; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
+; GFX942: ; %bb.0: ; %bb
+; GFX942-NEXT: v_mov_b32_e32 v0, 1
+; GFX942-NEXT: v_mov_b32_e32 v1, 2
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mfma_i32_16x16x4_4b_i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPR-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
+bb:
+ %in.1 = load <16 x i32>, ptr addrspace(1) %arg
+ %mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32 1, i32 2, <16 x i32> splat (i32 64), i32 1, i32 2, i32 3)
+ store <16 x i32> %mai.1, ptr addrspace(1) %arg
+ ret void
+}
+
define amdgpu_kernel void @test_mfma_i32_4x4x4i8(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC-LABEL: test_mfma_i32_4x4x4i8:
; NOLIT-SRCC: ; %bb.0: ; %bb
@@ -2822,6 +3352,23 @@ define amdgpu_kernel void @test_mfma_i32_4x4x4i8(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: s_nop 4
; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_i32_4x4x4i8:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_i32_4x4x4_16b_i8 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 4
+; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <4 x i32>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %in.1, i32 1, i32 2, i32 3)
@@ -2829,6 +3376,197 @@ bb:
ret void
}
+define amdgpu_kernel void @test_mfma_i32_4x4x4i8_splat_imm_src2_1(ptr addrspace(1) %arg) #0 {
+; NOLIT-SRCC-LABEL: test_mfma_i32_4x4x4i8_splat_imm_src2_1:
+; NOLIT-SRCC: ; %bb.0: ; %bb
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 1
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 1
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 1
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 1
+; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
+; NOLIT-SRCC-NEXT: s_nop 0
+; NOLIT-SRCC-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3
+; NOLIT-SRCC-NEXT: s_nop 3
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
+; NOLIT-SRCC-NEXT: s_nop 0
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; NOLIT-SRCC-NEXT: s_endpgm
+;
+; LIT-SRCC-LABEL: test_mfma_i32_4x4x4i8_splat_imm_src2_1:
+; LIT-SRCC: ; %bb.0: ; %bb
+; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1
+; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
+; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
+; LIT-SRCC-NEXT: s_nop 0
+; LIT-SRCC-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v0, v1, 1 cbsz:1 abid:2 blgp:3
+; LIT-SRCC-NEXT: s_nop 3
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
+; LIT-SRCC-NEXT: s_nop 0
+; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; LIT-SRCC-NEXT: s_endpgm
+;
+; GFX90A-LABEL: test_mfma_i32_4x4x4i8_splat_imm_src2_1:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: v_mov_b32_e32 v0, 1
+; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v2, 2
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v0, v2, 1 cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_nop 3
+; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_i32_4x4x4i8_splat_imm_src2_1:
+; GFX942: ; %bb.0: ; %bb
+; GFX942-NEXT: v_mov_b32_e32 v0, 1
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v2, 2
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mfma_i32_4x4x4_16b_i8 a[0:3], v0, v2, 1 cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_nop 3
+; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_i32_4x4x4i8_splat_imm_src2_1:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: v_mfma_i32_4x4x4_16b_i8 v[0:3], v0, v1, 1 cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_nop 3
+; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
+bb:
+ %in.1 = load <4 x i32>, ptr addrspace(1) %arg
+ %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> splat (i32 1), i32 1, i32 2, i32 3)
+ store <4 x i32> %mai.1, ptr addrspace(1) %arg
+ ret void
+}
+
+define amdgpu_kernel void @test_mfma_i32_4x4x4i8_splat_k_src2_1(ptr addrspace(1) %arg) #0 {
+; NOLIT-SRCC-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
+; NOLIT-SRCC: ; %bb.0:
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 0x41
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 1
+; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v0
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v0
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v0
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 2
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
+; NOLIT-SRCC-NEXT: s_nop 0
+; NOLIT-SRCC-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v1, v0, a[0:3] cbsz:1 abid:2 blgp:3
+; NOLIT-SRCC-NEXT: s_nop 3
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
+; NOLIT-SRCC-NEXT: s_nop 0
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; NOLIT-SRCC-NEXT: s_endpgm
+;
+; LIT-SRCC-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
+; LIT-SRCC: ; %bb.0:
+; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 0x41
+; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 1
+; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0
+; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v0
+; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v0
+; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v0
+; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 2
+; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
+; LIT-SRCC-NEXT: s_nop 0
+; LIT-SRCC-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v1, v0, a[0:3] cbsz:1 abid:2 blgp:3
+; LIT-SRCC-NEXT: s_nop 3
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
+; LIT-SRCC-NEXT: s_nop 0
+; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; LIT-SRCC-NEXT: s_endpgm
+;
+; GFX90A-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0x41
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, 1
+; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 2
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v1, v2, a[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_nop 3
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: v_mov_b32_e32 v1, 0x41
+; GFX942-NEXT: v_accvgpr_write_b32 a0, v1
+; GFX942-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0
+; GFX942-NEXT: v_mov_b32_e32 v2, 2
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mfma_i32_4x4x4_16b_i8 a[0:3], v1, v2, a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_nop 3
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
+; GFX942-VGPR: ; %bb.0:
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 1
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0x41
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: v_mfma_i32_4x4x4_16b_i8 v[0:3], v5, v6, v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_nop 3
+; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
+ %in.1 = load <4 x i32>, ptr addrspace(1) %arg
+ %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> splat (i32 65), i32 1, i32 2, i32 3)
+ store <4 x i32> %mai.1, ptr addrspace(1) %arg
+ ret void
+}
+
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC-LABEL: test_mfma_f32_32x32x1f32_forward_acc:
; NOLIT-SRCC: ; %bb.0: ; %bb
@@ -3219,6 +3957,64 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1)
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_32x32x1f32_forward_acc:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v33, 2.0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, s16
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, s17
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, s18
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, s19
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, s20
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, s21
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s22
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s23
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s25
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s26
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s27
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, s28
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, s29
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v14, s30
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v15, s31
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, s0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, s1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, s2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, s3
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v20, s4
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v21, s5
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v22, s6
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v23, s7
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v24, s8
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v25, s9
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v26, s10
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v27, s11
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v28, s12
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v29, s13
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v30, s14
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, s15
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35]
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
@@ -3435,6 +4231,34 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1)
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_16x16x1f32_forward_acc:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 2.0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, v[0:15]
+; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, v[0:15]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %in.1, i32 0, i32 0, i32 0)
@@ -3542,6 +4366,25 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_forward_acc(ptr addrspace(1) %
; GFX942-NEXT: s_nop 3
; GFX942-NEXT: global_store_dwordx4 v2, a[0:3], s[6:7]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_forward_acc:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v5, v[0:3]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v5, v[0:3]
+; GFX942-VGPR-NEXT: s_nop 3
+; GFX942-VGPR-NEXT: global_store_dwordx4 v6, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %in.1, i32 0, i32 0, i32 0)
@@ -3616,6 +4459,19 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm_splat(ptr addrspace(1) %ar
; GFX942-NEXT: s_nop 2
; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_imm_splat:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, 1.0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_nop 2
+; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
store <4 x float> %mai.1, ptr addrspace(1) %arg
@@ -3745,6 +4601,22 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) %
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_16x16x1f32_imm_splat:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v0, v1, 1.0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
store <16 x float> %mai.1, ptr addrspace(1) %arg
@@ -3885,6 +4757,24 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) %
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_32x32x8f16_imm_splat:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0x3c003c00
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, 0x40004000
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[0:1], v[2:3], 1.0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, <4 x half> <half 2.0, half 2.0, half 2.0, half 2.0>, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
store <16 x float> %mai.1, ptr addrspace(1) %arg
@@ -4091,6 +4981,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) %
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_32x32x1f32_imm_splat:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 0
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v0, v1, 0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, i32 0, i32 0, i32 0)
store <32 x float> %mai.1, ptr addrspace(1) %arg
@@ -4175,6 +5086,21 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm(ptr addrspace(1) %arg) #0
; GFX942-NEXT: s_nop 2
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_imm:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, v[0:3]
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_nop 2
+; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 1.0, float 2.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
store <4 x float> %mai.1, ptr addrspace(1) %arg
@@ -4355,6 +5281,36 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) #
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_16x16x1f32_imm:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v15, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v14, v0
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v0, v15, v[0:15]
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 2.0>, i32 0, i32 0, i32 0)
store <16 x float> %mai.1, ptr addrspace(1) %arg
@@ -4667,6 +5623,74 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_32x32x1f32_imm:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v14, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v15, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v20, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v21, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v22, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v23, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v24, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v25, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v26, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v27, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v28, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v29, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v30, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, v1
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[32:33], v[30:31]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v34, 2.0
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[30:31], v[28:29]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[28:29], v[26:27]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[26:27], v[24:25]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[24:25], v[22:23]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[22:23], v[20:21]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[20:21], v[18:19]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[18:19], v[16:17]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[16:17], v[14:15]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], v[12:13]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], v[10:11]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], v[8:9]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[2:33], v0, v34, v[2:33]
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[30:33], s[0:1] offset:112
+; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[26:29], s[0:1] offset:96
+; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[22:25], s[0:1] offset:80
+; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[18:21], s[0:1] offset:64
+; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[14:17], s[0:1] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[10:13], s[0:1] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[6:9], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> <float 1.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, i32 0, i32 0, i32 0)
store <32 x float> %mai.1, ptr addrspace(1) %arg
@@ -4755,6 +5779,24 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat(ptr addrspace(1) %ar
; GFX942-NEXT: s_nop 2
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_lit_splat:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 1.0
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-VGPR-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0x42f60000
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2.0
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v5, v6, v[0:3]
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_nop 2
+; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %arg, i32 %tid
@@ -4846,6 +5888,23 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat_bad_code(ptr addrspa
; GFX942-NEXT: s_nop 2
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_lit_splat_bad_code:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 1.0
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0x42f60000
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v5, v6, v[0:3]
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_nop 2
+; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %arg, i32 %tid
@@ -5109,6 +6168,37 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_32x32x1f32_vecarg:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-VGPR-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v33, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v34, 2.0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
+; GFX942-VGPR-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
+; GFX942-VGPR-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; GFX942-VGPR-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
+; GFX942-VGPR-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; GFX942-VGPR-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
+; GFX942-VGPR-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
+; GFX942-VGPR-NEXT: s_waitcnt vmcnt(0)
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %arg, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
index 04ee0bb..f78ea92 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
@@ -1485,30 +1485,30 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr
; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v12, s0
-; SDAG-NEXT: v_mov_b32_e32 v13, s1
-; SDAG-NEXT: v_mov_b32_e32 v14, s2
-; SDAG-NEXT: v_mov_b32_e32 v15, s3
-; SDAG-NEXT: v_mov_b32_e32 v16, s16
-; SDAG-NEXT: v_mov_b32_e32 v17, s17
-; SDAG-NEXT: v_mov_b32_e32 v18, s18
-; SDAG-NEXT: v_mov_b32_e32 v19, s19
-; SDAG-NEXT: v_mov_b32_e32 v20, s28
-; SDAG-NEXT: v_mov_b32_e32 v21, s29
-; SDAG-NEXT: v_mov_b32_e32 v4, s20
-; SDAG-NEXT: v_mov_b32_e32 v5, s21
-; SDAG-NEXT: v_mov_b32_e32 v6, s22
-; SDAG-NEXT: v_mov_b32_e32 v7, s23
-; SDAG-NEXT: v_mov_b32_e32 v8, s24
-; SDAG-NEXT: v_mov_b32_e32 v9, s25
-; SDAG-NEXT: v_mov_b32_e32 v10, s26
-; SDAG-NEXT: v_mov_b32_e32 v11, s27
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v21
+; SDAG-NEXT: v_mov_b32_e32 v14, s0
+; SDAG-NEXT: v_mov_b32_e32 v15, s1
+; SDAG-NEXT: v_mov_b32_e32 v16, s2
+; SDAG-NEXT: v_mov_b32_e32 v17, s3
+; SDAG-NEXT: v_mov_b32_e32 v18, s16
+; SDAG-NEXT: v_mov_b32_e32 v19, s17
+; SDAG-NEXT: v_mov_b32_e32 v20, s18
+; SDAG-NEXT: v_mov_b32_e32 v21, s19
+; SDAG-NEXT: v_mov_b32_e32 v4, s28
+; SDAG-NEXT: v_mov_b32_e32 v5, s29
+; SDAG-NEXT: v_mov_b32_e32 v6, s20
+; SDAG-NEXT: v_mov_b32_e32 v7, s21
+; SDAG-NEXT: v_mov_b32_e32 v8, s22
+; SDAG-NEXT: v_mov_b32_e32 v9, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s24
+; SDAG-NEXT: v_mov_b32_e32 v11, s25
+; SDAG-NEXT: v_mov_b32_e32 v12, s26
+; SDAG-NEXT: v_mov_b32_e32 v13, s27
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v4
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v5
; SDAG-NEXT: v_accvgpr_write_b32 a2, v0
; SDAG-NEXT: v_accvgpr_write_b32 a3, v1
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[12:19], v[4:11], a[0:3], v2, v3 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[6:13], a[0:3], v2, v3 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1895,7 +1895,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32
; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: v_mov_b32_e32 v20, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
@@ -1915,16 +1915,16 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32
; SDAG-NEXT: v_mov_b32_e32 v14, s22
; SDAG-NEXT: v_mov_b32_e32 v15, s23
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
-; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: v_mov_b32_e32 v21, s13
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s12, v17 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s12, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[14:15]
+; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[14:15]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd:
@@ -1937,20 +1937,18 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s24
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s25
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s26
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s27
-; GISEL-NEXT: v_mov_b32_e32 v16, s29
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; GISEL-NEXT: v_mov_b32_e32 v20, s29
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s28, v16 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s28, v20 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[30:31]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31]
; GISEL-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1)
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
@@ -1964,7 +1962,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
; SDAG-NEXT: s_movk_i32 s6, 0x41
; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: v_mov_b32_e32 v20, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
@@ -1974,7 +1972,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: v_mov_b32_e32 v5, s13
; SDAG-NEXT: v_mov_b32_e32 v6, s14
; SDAG-NEXT: v_mov_b32_e32 v7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
; SDAG-NEXT: v_mov_b32_e32 v8, s16
; SDAG-NEXT: v_mov_b32_e32 v9, s17
; SDAG-NEXT: v_mov_b32_e32 v10, s18
@@ -1983,21 +1981,19 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: v_mov_b32_e32 v13, s21
; SDAG-NEXT: v_mov_b32_e32 v14, s22
; SDAG-NEXT: v_mov_b32_e32 v15, s23
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s6, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s6, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
+; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
-; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
+; GISEL-NEXT: v_mov_b32_e32 v20, 0x41
; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
@@ -2005,19 +2001,17 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
; GISEL-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 -2)
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
@@ -2031,7 +2025,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
; SDAG-NEXT: s_movk_i32 s6, 0x41
; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: v_mov_b32_e32 v20, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
@@ -2041,7 +2035,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: v_mov_b32_e32 v5, s13
; SDAG-NEXT: v_mov_b32_e32 v6, s14
; SDAG-NEXT: v_mov_b32_e32 v7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
; SDAG-NEXT: v_mov_b32_e32 v8, s16
; SDAG-NEXT: v_mov_b32_e32 v9, s17
; SDAG-NEXT: v_mov_b32_e32 v10, s18
@@ -2050,21 +2044,19 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: v_mov_b32_e32 v13, s21
; SDAG-NEXT: v_mov_b32_e32 v14, s22
; SDAG-NEXT: v_mov_b32_e32 v15, s23
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s6, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s6, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
+; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
-; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
+; GISEL-NEXT: v_mov_b32_e32 v20, 0x41
; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
@@ -2072,19 +2064,17 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0]
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
; GISEL-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 1065353216)
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
@@ -2096,7 +2086,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: v_mov_b32_e32 v20, 0
; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v0, s8
@@ -2107,7 +2097,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: v_mov_b32_e32 v5, s13
; SDAG-NEXT: v_mov_b32_e32 v6, s14
; SDAG-NEXT: v_mov_b32_e32 v7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
; SDAG-NEXT: v_mov_b32_e32 v8, s16
; SDAG-NEXT: v_mov_b32_e32 v9, s17
; SDAG-NEXT: v_mov_b32_e32 v10, s18
@@ -2116,14 +2106,12 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: v_mov_b32_e32 v13, s21
; SDAG-NEXT: v_mov_b32_e32 v14, s22
; SDAG-NEXT: v_mov_b32_e32 v15, s23
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
+; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm:
@@ -2136,21 +2124,19 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
; GISEL-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 -2)
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
@@ -2162,7 +2148,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: v_mov_b32_e32 v20, 0
; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v0, s8
@@ -2173,7 +2159,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: v_mov_b32_e32 v5, s13
; SDAG-NEXT: v_mov_b32_e32 v6, s14
; SDAG-NEXT: v_mov_b32_e32 v7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
; SDAG-NEXT: v_mov_b32_e32 v8, s16
; SDAG-NEXT: v_mov_b32_e32 v9, s17
; SDAG-NEXT: v_mov_b32_e32 v10, s18
@@ -2182,14 +2168,12 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: v_mov_b32_e32 v13, s21
; SDAG-NEXT: v_mov_b32_e32 v14, s22
; SDAG-NEXT: v_mov_b32_e32 v15, s23
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
+; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal:
@@ -2202,21 +2186,19 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0]
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
; GISEL-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 1042479491)
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
@@ -2559,5 +2541,5 @@ declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6
declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1
declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1
-attributes #0 = { "amdgpu-flat-work-group-size"="512,512" }
+attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" }
attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
index 91197f9..0b2818f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
@@ -3515,26 +3515,26 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v24, s0
-; SDAG-NEXT: v_mov_b32_e32 v25, s1
-; SDAG-NEXT: v_mov_b32_e32 v26, s2
-; SDAG-NEXT: v_mov_b32_e32 v27, s3
-; SDAG-NEXT: v_mov_b32_e32 v28, s16
-; SDAG-NEXT: v_mov_b32_e32 v29, s17
-; SDAG-NEXT: v_mov_b32_e32 v30, s18
-; SDAG-NEXT: v_mov_b32_e32 v31, s19
-; SDAG-NEXT: v_mov_b32_e32 v32, s28
-; SDAG-NEXT: v_mov_b32_e32 v33, s29
-; SDAG-NEXT: v_mov_b32_e32 v16, s20
-; SDAG-NEXT: v_mov_b32_e32 v17, s21
-; SDAG-NEXT: v_mov_b32_e32 v18, s22
-; SDAG-NEXT: v_mov_b32_e32 v19, s23
-; SDAG-NEXT: v_mov_b32_e32 v20, s24
-; SDAG-NEXT: v_mov_b32_e32 v21, s25
-; SDAG-NEXT: v_mov_b32_e32 v22, s26
-; SDAG-NEXT: v_mov_b32_e32 v23, s27
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v32
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v33
+; SDAG-NEXT: v_mov_b32_e32 v26, s0
+; SDAG-NEXT: v_mov_b32_e32 v27, s1
+; SDAG-NEXT: v_mov_b32_e32 v28, s2
+; SDAG-NEXT: v_mov_b32_e32 v29, s3
+; SDAG-NEXT: v_mov_b32_e32 v30, s16
+; SDAG-NEXT: v_mov_b32_e32 v31, s17
+; SDAG-NEXT: v_mov_b32_e32 v32, s18
+; SDAG-NEXT: v_mov_b32_e32 v33, s19
+; SDAG-NEXT: v_mov_b32_e32 v16, s28
+; SDAG-NEXT: v_mov_b32_e32 v17, s29
+; SDAG-NEXT: v_mov_b32_e32 v18, s20
+; SDAG-NEXT: v_mov_b32_e32 v19, s21
+; SDAG-NEXT: v_mov_b32_e32 v20, s22
+; SDAG-NEXT: v_mov_b32_e32 v21, s23
+; SDAG-NEXT: v_mov_b32_e32 v22, s24
+; SDAG-NEXT: v_mov_b32_e32 v23, s25
+; SDAG-NEXT: v_mov_b32_e32 v24, s26
+; SDAG-NEXT: v_mov_b32_e32 v25, s27
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
; SDAG-NEXT: v_accvgpr_write_b32 a2, v0
; SDAG-NEXT: v_accvgpr_write_b32 a3, v1
; SDAG-NEXT: v_accvgpr_write_b32 a4, v2
@@ -3550,7 +3550,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr
; SDAG-NEXT: v_accvgpr_write_b32 a14, v12
; SDAG-NEXT: v_accvgpr_write_b32 a15, v13
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[24:31], v[16:23], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[18:25], a[0:15], v14, v15 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
@@ -3993,34 +3993,34 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v16, s0
-; SDAG-NEXT: v_mov_b32_e32 v17, s1
-; SDAG-NEXT: v_mov_b32_e32 v18, s2
-; SDAG-NEXT: v_mov_b32_e32 v19, s3
-; SDAG-NEXT: v_mov_b32_e32 v20, s16
-; SDAG-NEXT: v_mov_b32_e32 v21, s17
-; SDAG-NEXT: v_mov_b32_e32 v22, s18
-; SDAG-NEXT: v_mov_b32_e32 v23, s19
-; SDAG-NEXT: v_mov_b32_e32 v24, s20
-; SDAG-NEXT: v_mov_b32_e32 v25, s21
-; SDAG-NEXT: v_mov_b32_e32 v26, s22
-; SDAG-NEXT: v_mov_b32_e32 v27, s23
-; SDAG-NEXT: v_mov_b32_e32 v28, s24
-; SDAG-NEXT: v_mov_b32_e32 v29, s25
-; SDAG-NEXT: v_mov_b32_e32 v30, s26
-; SDAG-NEXT: v_mov_b32_e32 v31, s27
-; SDAG-NEXT: v_mov_b32_e32 v32, s28
-; SDAG-NEXT: v_mov_b32_e32 v33, s29
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v31
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v32
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v33
+; SDAG-NEXT: v_mov_b32_e32 v26, s0
+; SDAG-NEXT: v_mov_b32_e32 v27, s1
+; SDAG-NEXT: v_mov_b32_e32 v28, s2
+; SDAG-NEXT: v_mov_b32_e32 v29, s3
+; SDAG-NEXT: v_mov_b32_e32 v30, s16
+; SDAG-NEXT: v_mov_b32_e32 v31, s17
+; SDAG-NEXT: v_mov_b32_e32 v32, s18
+; SDAG-NEXT: v_mov_b32_e32 v33, s19
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
+; SDAG-NEXT: v_mov_b32_e32 v20, s24
+; SDAG-NEXT: v_mov_b32_e32 v21, s25
+; SDAG-NEXT: v_mov_b32_e32 v22, s26
+; SDAG-NEXT: v_mov_b32_e32 v23, s27
+; SDAG-NEXT: v_mov_b32_e32 v24, s28
+; SDAG-NEXT: v_mov_b32_e32 v25, s29
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
; SDAG-NEXT: v_accvgpr_write_b32 a10, v8
; SDAG-NEXT: v_accvgpr_write_b32 a11, v9
; SDAG-NEXT: v_accvgpr_write_b32 a12, v10
@@ -4028,7 +4028,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
; SDAG-NEXT: v_accvgpr_write_b32 a14, v12
; SDAG-NEXT: v_accvgpr_write_b32 a15, v13
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
@@ -4539,49 +4539,41 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32>
; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s36
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: v_mov_b32_e32 v4, s12
-; SDAG-NEXT: v_mov_b32_e32 v5, s13
-; SDAG-NEXT: v_mov_b32_e32 v6, s14
-; SDAG-NEXT: v_mov_b32_e32 v7, s15
-; SDAG-NEXT: v_mov_b32_e32 v8, s16
-; SDAG-NEXT: v_mov_b32_e32 v9, s17
-; SDAG-NEXT: v_mov_b32_e32 v10, s18
-; SDAG-NEXT: v_mov_b32_e32 v11, s19
-; SDAG-NEXT: v_mov_b32_e32 v12, s20
-; SDAG-NEXT: v_mov_b32_e32 v13, s21
-; SDAG-NEXT: v_mov_b32_e32 v14, s22
-; SDAG-NEXT: v_mov_b32_e32 v15, s23
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s37
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s38
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s39
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s40
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s41
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s42
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s43
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s44
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s45
-; SDAG-NEXT: v_accvgpr_write_b32 a10, s46
-; SDAG-NEXT: v_accvgpr_write_b32 a11, s47
-; SDAG-NEXT: v_accvgpr_write_b32 a12, s48
-; SDAG-NEXT: v_accvgpr_write_b32 a13, s49
-; SDAG-NEXT: v_accvgpr_write_b32 a14, s50
-; SDAG-NEXT: v_accvgpr_write_b32 a15, s51
-; SDAG-NEXT: v_mov_b32_e32 v16, s1
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: v_mov_b32_e32 v20, s12
+; SDAG-NEXT: v_mov_b32_e32 v21, s13
+; SDAG-NEXT: v_mov_b32_e32 v22, s14
+; SDAG-NEXT: v_mov_b32_e32 v23, s15
+; SDAG-NEXT: v_mov_b32_e32 v24, s16
+; SDAG-NEXT: v_mov_b32_e32 v25, s17
+; SDAG-NEXT: v_mov_b32_e32 v26, s18
+; SDAG-NEXT: v_mov_b32_e32 v27, s19
+; SDAG-NEXT: v_mov_b32_e32 v28, s20
+; SDAG-NEXT: v_mov_b32_e32 v29, s21
+; SDAG-NEXT: v_mov_b32_e32 v30, s22
+; SDAG-NEXT: v_mov_b32_e32 v31, s23
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
+; SDAG-NEXT: v_mov_b32_e32 v32, s1
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], s0, v32 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
-; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[2:3] offset:48
-; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[2:3] offset:32
-; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[2:3] offset:16
-; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3]
+; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16
+; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd:
@@ -4590,41 +4582,33 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32>
; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s36
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s37
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s38
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s39
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s40
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s41
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s42
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s43
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s44
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s45
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s46
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s47
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s48
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s49
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s50
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s51
-; GISEL-NEXT: v_mov_b32_e32 v16, s1
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
+; GISEL-NEXT: v_mov_b32_e32 v32, s1
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], s0, v32 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3]
-; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[2:3] offset:16
-; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[2:3] offset:32
-; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[2:3] offset:48
+; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3]
+; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16
+; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32
+; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48
; GISEL-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1)
store <16 x float> %result, ptr addrspace(1) %ptr, align 64
@@ -4639,91 +4623,75 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_
; SDAG-NEXT: s_movk_i32 s2, 0x41
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: v_mov_b32_e32 v4, s12
-; SDAG-NEXT: v_mov_b32_e32 v5, s13
-; SDAG-NEXT: v_mov_b32_e32 v6, s14
-; SDAG-NEXT: v_mov_b32_e32 v7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s36
-; SDAG-NEXT: v_mov_b32_e32 v8, s16
-; SDAG-NEXT: v_mov_b32_e32 v9, s17
-; SDAG-NEXT: v_mov_b32_e32 v10, s18
-; SDAG-NEXT: v_mov_b32_e32 v11, s19
-; SDAG-NEXT: v_mov_b32_e32 v12, s20
-; SDAG-NEXT: v_mov_b32_e32 v13, s21
-; SDAG-NEXT: v_mov_b32_e32 v14, s22
-; SDAG-NEXT: v_mov_b32_e32 v15, s23
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s37
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s38
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s39
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s40
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s41
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s42
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s43
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s44
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s45
-; SDAG-NEXT: v_accvgpr_write_b32 a10, s46
-; SDAG-NEXT: v_accvgpr_write_b32 a11, s47
-; SDAG-NEXT: v_accvgpr_write_b32 a12, s48
-; SDAG-NEXT: v_accvgpr_write_b32 a13, s49
-; SDAG-NEXT: v_accvgpr_write_b32 a14, s50
-; SDAG-NEXT: v_accvgpr_write_b32 a15, s51
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: v_mov_b32_e32 v20, s12
+; SDAG-NEXT: v_mov_b32_e32 v21, s13
+; SDAG-NEXT: v_mov_b32_e32 v22, s14
+; SDAG-NEXT: v_mov_b32_e32 v23, s15
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
+; SDAG-NEXT: v_mov_b32_e32 v24, s16
+; SDAG-NEXT: v_mov_b32_e32 v25, s17
+; SDAG-NEXT: v_mov_b32_e32 v26, s18
+; SDAG-NEXT: v_mov_b32_e32 v27, s19
+; SDAG-NEXT: v_mov_b32_e32 v28, s20
+; SDAG-NEXT: v_mov_b32_e32 v29, s21
+; SDAG-NEXT: v_mov_b32_e32 v30, s22
+; SDAG-NEXT: v_mov_b32_e32 v31, s23
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s2, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], s2, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
-; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
-; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
+; GISEL-NEXT: v_mov_b32_e32 v32, 0x41
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s36
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s37
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s38
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s39
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s40
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s41
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s42
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s43
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s44
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s45
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s46
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s47
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s48
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s49
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s50
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s51
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 65, i32 1, i32 -2)
store <16 x float> %result, ptr addrspace(1) %ptr, align 64
@@ -4735,26 +4703,26 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: v_mov_b32_e32 v4, s16
-; SDAG-NEXT: v_mov_b32_e32 v5, s17
-; SDAG-NEXT: v_mov_b32_e32 v6, s18
-; SDAG-NEXT: v_mov_b32_e32 v7, s19
-; SDAG-NEXT: v_mov_b32_e32 v8, s20
-; SDAG-NEXT: v_mov_b32_e32 v9, s21
-; SDAG-NEXT: v_mov_b32_e32 v10, s22
-; SDAG-NEXT: v_mov_b32_e32 v11, s23
+; SDAG-NEXT: v_mov_b32_e32 v2, s12
+; SDAG-NEXT: v_mov_b32_e32 v3, s13
+; SDAG-NEXT: v_mov_b32_e32 v4, s14
+; SDAG-NEXT: v_mov_b32_e32 v5, s15
+; SDAG-NEXT: v_mov_b32_e32 v6, s16
+; SDAG-NEXT: v_mov_b32_e32 v7, s17
+; SDAG-NEXT: v_mov_b32_e32 v8, s18
+; SDAG-NEXT: v_mov_b32_e32 v9, s19
+; SDAG-NEXT: v_mov_b32_e32 v10, s20
+; SDAG-NEXT: v_mov_b32_e32 v11, s21
+; SDAG-NEXT: v_mov_b32_e32 v12, s22
+; SDAG-NEXT: v_mov_b32_e32 v13, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
-; SDAG-NEXT: v_mov_b32_e32 v12, s24
-; SDAG-NEXT: v_mov_b32_e32 v13, s25
-; SDAG-NEXT: v_mov_b32_e32 v14, s26
+; SDAG-NEXT: v_mov_b32_e32 v14, s24
+; SDAG-NEXT: v_mov_b32_e32 v15, s25
+; SDAG-NEXT: v_mov_b32_e32 v16, s26
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_mov_b32_e32 v15, s27
+; SDAG-NEXT: v_mov_b32_e32 v17, s27
; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
@@ -4770,45 +4738,44 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x
; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
-; SDAG-NEXT: v_mov_b32_e32 v16, s1
+; SDAG-NEXT: v_mov_b32_e32 v0, s1
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel_hi:[0,0,0]
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], 48
-; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], s0, v0 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mov_b32_e32 v2, s20
+; SDAG-NEXT: v_mov_b32_e32 v3, s21
+; SDAG-NEXT: v_mov_b32_e32 v4, s22
+; SDAG-NEXT: v_mov_b32_e32 v5, s23
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
+; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], 32
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v6, s18
+; SDAG-NEXT: v_mov_b32_e32 v7, s19
+; SDAG-NEXT: v_mov_b32_e32 v4, s16
+; SDAG-NEXT: v_mov_b32_e32 v5, s17
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
+; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v8, s14
+; SDAG-NEXT: v_mov_b32_e32 v9, s15
+; SDAG-NEXT: v_mov_b32_e32 v6, s12
+; SDAG-NEXT: v_mov_b32_e32 v7, s13
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
+; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v10, s10
+; SDAG-NEXT: v_mov_b32_e32 v11, s11
+; SDAG-NEXT: v_mov_b32_e32 v8, s8
+; SDAG-NEXT: v_mov_b32_e32 v9, s9
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
+; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[6:7], a[8:11], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[4:5], a[12:15], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], a[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[8:9], a[4:7], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -4922,42 +4889,41 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], 48
-; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v2, s20
+; SDAG-NEXT: v_mov_b32_e32 v3, s21
+; SDAG-NEXT: v_mov_b32_e32 v4, s22
+; SDAG-NEXT: v_mov_b32_e32 v5, s23
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
+; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], 32
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v6, s18
+; SDAG-NEXT: v_mov_b32_e32 v7, s19
+; SDAG-NEXT: v_mov_b32_e32 v4, s16
+; SDAG-NEXT: v_mov_b32_e32 v5, s17
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
+; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v8, s14
+; SDAG-NEXT: v_mov_b32_e32 v9, s15
+; SDAG-NEXT: v_mov_b32_e32 v6, s12
+; SDAG-NEXT: v_mov_b32_e32 v7, s13
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
+; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v10, s10
+; SDAG-NEXT: v_mov_b32_e32 v11, s11
+; SDAG-NEXT: v_mov_b32_e32 v8, s8
+; SDAG-NEXT: v_mov_b32_e32 v9, s9
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
+; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[6:7], a[8:11], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[4:5], a[12:15], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], a[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[8:9], a[4:7], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -5033,78 +4999,72 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: v_mov_b32_e32 v4, s16
-; SDAG-NEXT: v_mov_b32_e32 v5, s17
-; SDAG-NEXT: v_mov_b32_e32 v6, s18
-; SDAG-NEXT: v_mov_b32_e32 v7, s19
-; SDAG-NEXT: v_mov_b32_e32 v8, s20
-; SDAG-NEXT: v_mov_b32_e32 v9, s21
-; SDAG-NEXT: v_mov_b32_e32 v10, s22
-; SDAG-NEXT: v_mov_b32_e32 v11, s23
+; SDAG-NEXT: v_mov_b32_e32 v32, s12
+; SDAG-NEXT: v_mov_b32_e32 v33, s13
+; SDAG-NEXT: v_mov_b32_e32 v34, s14
+; SDAG-NEXT: v_mov_b32_e32 v35, s15
+; SDAG-NEXT: v_mov_b32_e32 v36, s16
+; SDAG-NEXT: v_mov_b32_e32 v37, s17
+; SDAG-NEXT: v_mov_b32_e32 v38, s18
+; SDAG-NEXT: v_mov_b32_e32 v39, s19
+; SDAG-NEXT: v_mov_b32_e32 v40, s20
+; SDAG-NEXT: v_mov_b32_e32 v41, s21
+; SDAG-NEXT: v_mov_b32_e32 v42, s22
+; SDAG-NEXT: v_mov_b32_e32 v43, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
-; SDAG-NEXT: v_mov_b32_e32 v12, s24
-; SDAG-NEXT: v_mov_b32_e32 v13, s25
-; SDAG-NEXT: v_mov_b32_e32 v14, s26
-; SDAG-NEXT: v_mov_b32_e32 v15, s27
+; SDAG-NEXT: v_mov_b32_e32 v44, s24
+; SDAG-NEXT: v_mov_b32_e32 v45, s25
+; SDAG-NEXT: v_mov_b32_e32 v46, s26
+; SDAG-NEXT: v_mov_b32_e32 v47, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a31, s23
-; SDAG-NEXT: v_accvgpr_write_b32 a30, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a29, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a28, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a27, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a26, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a25, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a24, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a23, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a22, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a21, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a20, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a19, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a18, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a17, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a16, s8
+; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[16:31] blgp:2
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], 48
-; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: s_nop 6
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48
+; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], 32
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
+; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[6:7], a[8:11], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[4:5], a[12:15], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], a[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[8:9], a[4:7], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -5112,61 +5072,45 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
-; GISEL-NEXT: v_accvgpr_write_b32 a31, s23
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
-; GISEL-NEXT: v_accvgpr_write_b32 a30, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a29, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a28, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a27, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a26, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a25, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a24, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a23, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a22, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a21, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a20, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a19, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a18, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a17, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a16, s8
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[16:31] blgp:2
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37]
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[42:43]
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[44:45]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[46:47]
+; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[48:49]
+; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[50:51]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48
+; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[18:19], v[4:7], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[20:21], a[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0)
@@ -5180,78 +5124,70 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: v_mov_b32_e32 v4, s16
-; SDAG-NEXT: v_mov_b32_e32 v5, s17
-; SDAG-NEXT: v_mov_b32_e32 v6, s18
-; SDAG-NEXT: v_mov_b32_e32 v7, s19
-; SDAG-NEXT: v_mov_b32_e32 v8, s20
-; SDAG-NEXT: v_mov_b32_e32 v9, s21
-; SDAG-NEXT: v_mov_b32_e32 v10, s22
-; SDAG-NEXT: v_mov_b32_e32 v11, s23
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: v_mov_b32_e32 v20, s16
+; SDAG-NEXT: v_mov_b32_e32 v21, s17
+; SDAG-NEXT: v_mov_b32_e32 v22, s18
+; SDAG-NEXT: v_mov_b32_e32 v23, s19
+; SDAG-NEXT: v_mov_b32_e32 v24, s20
+; SDAG-NEXT: v_mov_b32_e32 v25, s21
+; SDAG-NEXT: v_mov_b32_e32 v26, s22
+; SDAG-NEXT: v_mov_b32_e32 v27, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
-; SDAG-NEXT: v_mov_b32_e32 v12, s24
-; SDAG-NEXT: v_mov_b32_e32 v13, s25
-; SDAG-NEXT: v_mov_b32_e32 v14, s26
-; SDAG-NEXT: v_mov_b32_e32 v15, s27
+; SDAG-NEXT: v_mov_b32_e32 v28, s24
+; SDAG-NEXT: v_mov_b32_e32 v29, s25
+; SDAG-NEXT: v_mov_b32_e32 v30, s26
+; SDAG-NEXT: v_mov_b32_e32 v31, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], 48
-; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48
+; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], 32
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
+; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[6:7], a[8:11], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[4:5], a[12:15], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], a[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[8:9], a[4:7], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -5259,61 +5195,53 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[36:37]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[38:39]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[40:41]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[42:43]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[44:45]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[46:47]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[48:49]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[50:51]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[18:19], v[4:7], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[20:21], a[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42)
@@ -6302,6 +6230,6 @@ declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6
declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2
declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2
-attributes #0 = { "amdgpu-flat-work-group-size"="512,512" }
+attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" }
attributes #1 = { "amdgpu-flat-work-group-size"="128,128" }
attributes #2 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
index 6b7e8cb..31a48de 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
@@ -1,33 +1,54 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -global-isel < %s | FileCheck --check-prefixes=GCN,GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 < %s | FileCheck --check-prefixes=GCN,GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 -global-isel < %s | FileCheck --check-prefixes=GCN,GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-GISEL %s
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float>, <2 x float>, <4 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float>, <2 x float>, <16 x float>, i32, i32, i32)
define amdgpu_kernel void @test_mfma_f32_16x16x8xf32(ptr addrspace(1) %arg) #0 {
-; GFX942-LABEL: test_mfma_f32_16x16x8xf32:
-; GFX942: ; %bb.0: ; %bb
-; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 2.0
-; GFX942-NEXT: v_mov_b32_e32 v2, 0x40400000
-; GFX942-NEXT: v_mov_b32_e32 v3, 4.0
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_16x16x8_xf32 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-NEXT: s_nop 6
-; GFX942-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: test_mfma_f32_16x16x8xf32:
+; GFX942-SDAG: ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, 2.0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0x40400000
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-SDAG-NEXT: s_nop 1
+; GFX942-SDAG-NEXT: v_mfma_f32_16x16x8_xf32 a[0:3], v[4:5], v[0:1], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT: s_nop 6
+; GFX942-SDAG-NEXT: global_store_dwordx4 v2, a[0:3], s[6:7]
+; GFX942-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: test_mfma_f32_16x16x8xf32:
+; GFX942-GISEL: ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-GISEL-NEXT: s_mov_b32 s4, 1.0
+; GFX942-GISEL-NEXT: s_mov_b32 s5, 2.0
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-GISEL-NEXT: s_mov_b32 s4, 0x40400000
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-GISEL-NEXT: s_mov_b32 s5, 4.0
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: v_mfma_f32_16x16x8_xf32 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT: s_nop 5
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float> <float 1.0, float 2.0>, <2 x float> <float 3.0, float 4.0>, <4 x float> %in.1, i32 1, i32 2, i32 3)
@@ -36,42 +57,81 @@ bb:
}
define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(ptr addrspace(1) %arg) #0 {
-; GFX942-LABEL: test_mfma_f32_32x32x4xf32:
-; GFX942: ; %bb.0: ; %bb
-; GFX942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 2.0
-; GFX942-NEXT: v_mov_b32_e32 v2, 0x40400000
-; GFX942-NEXT: v_mov_b32_e32 v3, 4.0
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX942-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX942-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX942-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX942-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX942-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX942-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX942-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX942-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX942-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX942-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX942-NEXT: v_accvgpr_write_b32 a15, s15
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x4_xf32 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: s_nop 7
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
-; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
-; GFX942-NEXT: s_endpgm
+; GFX942-SDAG-LABEL: test_mfma_f32_32x32x4xf32:
+; GFX942-SDAG: ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 1.0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 2.0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0x40400000
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-SDAG-NEXT: s_nop 1
+; GFX942-SDAG-NEXT: v_mfma_f32_32x32x4_xf32 a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: s_nop 7
+; GFX942-SDAG-NEXT: s_nop 1
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX942-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: test_mfma_f32_32x32x4xf32:
+; GFX942-GISEL: ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-GISEL-NEXT: s_mov_b32 s18, 1.0
+; GFX942-GISEL-NEXT: s_mov_b32 s19, 2.0
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[18:19]
+; GFX942-GISEL-NEXT: s_mov_b32 s18, 0x40400000
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-GISEL-NEXT: s_mov_b32 s19, 4.0
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19]
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: v_mfma_f32_32x32x4_xf32 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT: s_nop 7
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX942-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float> <float 1.0, float 2.0>, <2 x float> <float 3.0, float 4.0>, <16 x float> %in.1, i32 1, i32 2, i32 3)
@@ -81,5 +141,4 @@ bb:
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GCN: {{.*}}
-; GISEL: {{.*}}
+; GFX942: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
index 77d4aad..b25fe83 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
@@ -44,23 +44,23 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) %
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_mov_b32_e32 v16, s16
+; GISEL-NEXT: v_mov_b32_e32 v12, s16
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[6:7]
+; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[6:7]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -120,25 +120,25 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16__sgpr(<8 x half> inreg %arg0, <
; SDAG-LABEL: test_smfmac_f32_16x16x64_f16__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v8, s0
-; SDAG-NEXT: v_mov_b32_e32 v9, s1
-; SDAG-NEXT: v_mov_b32_e32 v10, s2
-; SDAG-NEXT: v_mov_b32_e32 v11, s3
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: v_mov_b32_e32 v4, s20
-; SDAG-NEXT: v_mov_b32_e32 v5, s21
-; SDAG-NEXT: v_mov_b32_e32 v6, s22
-; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s0
+; SDAG-NEXT: v_mov_b32_e32 v11, s1
+; SDAG-NEXT: v_mov_b32_e32 v12, s2
+; SDAG-NEXT: v_mov_b32_e32 v13, s3
+; SDAG-NEXT: v_mov_b32_e32 v2, s16
+; SDAG-NEXT: v_mov_b32_e32 v3, s17
+; SDAG-NEXT: v_mov_b32_e32 v4, s18
+; SDAG-NEXT: v_mov_b32_e32 v5, s19
+; SDAG-NEXT: v_mov_b32_e32 v6, s20
+; SDAG-NEXT: v_mov_b32_e32 v7, s21
+; SDAG-NEXT: v_mov_b32_e32 v8, s22
+; SDAG-NEXT: v_mov_b32_e32 v9, s23
; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT: v_mov_b32_e32 v12, s28
+; SDAG-NEXT: v_mov_b32_e32 v0, s28
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[8:11], v[0:7], v12
+; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[10:13], v[2:9], v0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -624,25 +624,25 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__sgpr(<8 x bfloat> inreg %arg0
; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__sgpr:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v8, s0
-; GCN-NEXT: v_mov_b32_e32 v9, s1
-; GCN-NEXT: v_mov_b32_e32 v10, s2
-; GCN-NEXT: v_mov_b32_e32 v11, s3
-; GCN-NEXT: v_mov_b32_e32 v0, s16
-; GCN-NEXT: v_mov_b32_e32 v1, s17
-; GCN-NEXT: v_mov_b32_e32 v2, s18
-; GCN-NEXT: v_mov_b32_e32 v3, s19
-; GCN-NEXT: v_mov_b32_e32 v4, s20
-; GCN-NEXT: v_mov_b32_e32 v5, s21
-; GCN-NEXT: v_mov_b32_e32 v6, s22
-; GCN-NEXT: v_mov_b32_e32 v7, s23
+; GCN-NEXT: v_mov_b32_e32 v10, s0
+; GCN-NEXT: v_mov_b32_e32 v11, s1
+; GCN-NEXT: v_mov_b32_e32 v12, s2
+; GCN-NEXT: v_mov_b32_e32 v13, s3
+; GCN-NEXT: v_mov_b32_e32 v2, s16
+; GCN-NEXT: v_mov_b32_e32 v3, s17
+; GCN-NEXT: v_mov_b32_e32 v4, s18
+; GCN-NEXT: v_mov_b32_e32 v5, s19
+; GCN-NEXT: v_mov_b32_e32 v6, s20
+; GCN-NEXT: v_mov_b32_e32 v7, s21
+; GCN-NEXT: v_mov_b32_e32 v8, s22
+; GCN-NEXT: v_mov_b32_e32 v9, s23
; GCN-NEXT: v_accvgpr_write_b32 a0, s24
; GCN-NEXT: v_accvgpr_write_b32 a1, s25
; GCN-NEXT: v_accvgpr_write_b32 a2, s26
; GCN-NEXT: v_accvgpr_write_b32 a3, s27
-; GCN-NEXT: v_mov_b32_e32 v12, s28
+; GCN-NEXT: v_mov_b32_e32 v0, s28
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[8:11], v[0:7], v12
+; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[10:13], v[2:9], v0
; GCN-NEXT: s_nop 7
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
@@ -887,24 +887,24 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
+; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b32_e32 v16, s2
+; GISEL-NEXT: v_mov_b32_e32 v12, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -964,25 +964,25 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8__sgpr(<4 x i32> inreg %arg0, <8 x
; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v8, s0
-; SDAG-NEXT: v_mov_b32_e32 v9, s1
-; SDAG-NEXT: v_mov_b32_e32 v10, s2
-; SDAG-NEXT: v_mov_b32_e32 v11, s3
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: v_mov_b32_e32 v4, s20
-; SDAG-NEXT: v_mov_b32_e32 v5, s21
-; SDAG-NEXT: v_mov_b32_e32 v6, s22
-; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s0
+; SDAG-NEXT: v_mov_b32_e32 v11, s1
+; SDAG-NEXT: v_mov_b32_e32 v12, s2
+; SDAG-NEXT: v_mov_b32_e32 v13, s3
+; SDAG-NEXT: v_mov_b32_e32 v2, s16
+; SDAG-NEXT: v_mov_b32_e32 v3, s17
+; SDAG-NEXT: v_mov_b32_e32 v4, s18
+; SDAG-NEXT: v_mov_b32_e32 v5, s19
+; SDAG-NEXT: v_mov_b32_e32 v6, s20
+; SDAG-NEXT: v_mov_b32_e32 v7, s21
+; SDAG-NEXT: v_mov_b32_e32 v8, s22
+; SDAG-NEXT: v_mov_b32_e32 v9, s23
; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT: v_mov_b32_e32 v12, s28
+; SDAG-NEXT: v_mov_b32_e32 v0, s28
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[8:11], v[0:7], v12
+; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[10:13], v[2:9], v0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -1429,24 +1429,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
+; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b32_e32 v16, s2
+; GISEL-NEXT: v_mov_b32_e32 v12, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1506,25 +1506,25 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__sgpr(<4 x i32> inreg %arg
; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v8, s0
-; SDAG-NEXT: v_mov_b32_e32 v9, s1
-; SDAG-NEXT: v_mov_b32_e32 v10, s2
-; SDAG-NEXT: v_mov_b32_e32 v11, s3
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: v_mov_b32_e32 v4, s20
-; SDAG-NEXT: v_mov_b32_e32 v5, s21
-; SDAG-NEXT: v_mov_b32_e32 v6, s22
-; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s0
+; SDAG-NEXT: v_mov_b32_e32 v11, s1
+; SDAG-NEXT: v_mov_b32_e32 v12, s2
+; SDAG-NEXT: v_mov_b32_e32 v13, s3
+; SDAG-NEXT: v_mov_b32_e32 v2, s16
+; SDAG-NEXT: v_mov_b32_e32 v3, s17
+; SDAG-NEXT: v_mov_b32_e32 v4, s18
+; SDAG-NEXT: v_mov_b32_e32 v5, s19
+; SDAG-NEXT: v_mov_b32_e32 v6, s20
+; SDAG-NEXT: v_mov_b32_e32 v7, s21
+; SDAG-NEXT: v_mov_b32_e32 v8, s22
+; SDAG-NEXT: v_mov_b32_e32 v9, s23
; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT: v_mov_b32_e32 v12, s28
+; SDAG-NEXT: v_mov_b32_e32 v0, s28
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[8:11], v[0:7], v12
+; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[10:13], v[2:9], v0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -1598,24 +1598,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
+; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b32_e32 v16, s2
+; GISEL-NEXT: v_mov_b32_e32 v12, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1675,25 +1675,25 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__sgpr(<4 x i32> inreg %arg
; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v8, s0
-; SDAG-NEXT: v_mov_b32_e32 v9, s1
-; SDAG-NEXT: v_mov_b32_e32 v10, s2
-; SDAG-NEXT: v_mov_b32_e32 v11, s3
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: v_mov_b32_e32 v4, s20
-; SDAG-NEXT: v_mov_b32_e32 v5, s21
-; SDAG-NEXT: v_mov_b32_e32 v6, s22
-; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s0
+; SDAG-NEXT: v_mov_b32_e32 v11, s1
+; SDAG-NEXT: v_mov_b32_e32 v12, s2
+; SDAG-NEXT: v_mov_b32_e32 v13, s3
+; SDAG-NEXT: v_mov_b32_e32 v2, s16
+; SDAG-NEXT: v_mov_b32_e32 v3, s17
+; SDAG-NEXT: v_mov_b32_e32 v4, s18
+; SDAG-NEXT: v_mov_b32_e32 v5, s19
+; SDAG-NEXT: v_mov_b32_e32 v6, s20
+; SDAG-NEXT: v_mov_b32_e32 v7, s21
+; SDAG-NEXT: v_mov_b32_e32 v8, s22
+; SDAG-NEXT: v_mov_b32_e32 v9, s23
; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT: v_mov_b32_e32 v12, s28
+; SDAG-NEXT: v_mov_b32_e32 v0, s28
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[8:11], v[0:7], v12
+; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[10:13], v[2:9], v0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -1767,24 +1767,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
+; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b32_e32 v16, s2
+; GISEL-NEXT: v_mov_b32_e32 v12, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1844,25 +1844,25 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__sgpr(<4 x i32> inreg %arg
; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v8, s0
-; SDAG-NEXT: v_mov_b32_e32 v9, s1
-; SDAG-NEXT: v_mov_b32_e32 v10, s2
-; SDAG-NEXT: v_mov_b32_e32 v11, s3
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: v_mov_b32_e32 v4, s20
-; SDAG-NEXT: v_mov_b32_e32 v5, s21
-; SDAG-NEXT: v_mov_b32_e32 v6, s22
-; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s0
+; SDAG-NEXT: v_mov_b32_e32 v11, s1
+; SDAG-NEXT: v_mov_b32_e32 v12, s2
+; SDAG-NEXT: v_mov_b32_e32 v13, s3
+; SDAG-NEXT: v_mov_b32_e32 v2, s16
+; SDAG-NEXT: v_mov_b32_e32 v3, s17
+; SDAG-NEXT: v_mov_b32_e32 v4, s18
+; SDAG-NEXT: v_mov_b32_e32 v5, s19
+; SDAG-NEXT: v_mov_b32_e32 v6, s20
+; SDAG-NEXT: v_mov_b32_e32 v7, s21
+; SDAG-NEXT: v_mov_b32_e32 v8, s22
+; SDAG-NEXT: v_mov_b32_e32 v9, s23
; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT: v_mov_b32_e32 v12, s28
+; SDAG-NEXT: v_mov_b32_e32 v0, s28
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[8:11], v[0:7], v12
+; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[10:13], v[2:9], v0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -1936,24 +1936,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
+; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b32_e32 v16, s2
+; GISEL-NEXT: v_mov_b32_e32 v12, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -2013,25 +2013,25 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__sgpr(<4 x i32> inreg %arg
; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v8, s0
-; SDAG-NEXT: v_mov_b32_e32 v9, s1
-; SDAG-NEXT: v_mov_b32_e32 v10, s2
-; SDAG-NEXT: v_mov_b32_e32 v11, s3
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: v_mov_b32_e32 v4, s20
-; SDAG-NEXT: v_mov_b32_e32 v5, s21
-; SDAG-NEXT: v_mov_b32_e32 v6, s22
-; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s0
+; SDAG-NEXT: v_mov_b32_e32 v11, s1
+; SDAG-NEXT: v_mov_b32_e32 v12, s2
+; SDAG-NEXT: v_mov_b32_e32 v13, s3
+; SDAG-NEXT: v_mov_b32_e32 v2, s16
+; SDAG-NEXT: v_mov_b32_e32 v3, s17
+; SDAG-NEXT: v_mov_b32_e32 v4, s18
+; SDAG-NEXT: v_mov_b32_e32 v5, s19
+; SDAG-NEXT: v_mov_b32_e32 v6, s20
+; SDAG-NEXT: v_mov_b32_e32 v7, s21
+; SDAG-NEXT: v_mov_b32_e32 v8, s22
+; SDAG-NEXT: v_mov_b32_e32 v9, s23
; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT: v_mov_b32_e32 v12, s28
+; SDAG-NEXT: v_mov_b32_e32 v0, s28
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[8:11], v[0:7], v12
+; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[10:13], v[2:9], v0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -3552,4 +3552,4 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg
ret <16 x float> %result
}
-attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
+attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-agpr-alloc"="0,0" }
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll
index 5b2de59..393581f 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll
@@ -121,18 +121,7 @@ define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_postcvt
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX1250-NEXT: v_max_num_f32_e32 v0, 0, v0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
-; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_min_num_f32_e32 v0, 1.0, v0
-; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT: v_fma_mixhi_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%src0.ext = fpext bfloat %src0 to float
%src1.ext = fpext bfloat %src1 to float
@@ -150,20 +139,9 @@ define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_postcvt
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_fma_mixlo_bf16 v1, v0, v1, v2 op_sel_hi:[1,1,1]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX1250-NEXT: v_max_num_f32_e32 v0, 0, v0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
-; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_min_num_f32_e32 v0, 1.0, v0
-; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_store_b16 v[0:1], v1, off scope:SCOPE_SYS
+; GFX1250-NEXT: v_fma_mixlo_bf16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: v_fma_mixhi_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; GFX1250-NEXT: global_store_b16 v[0:1], v3, off scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%src0.ext = fpext bfloat %src0 to float
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll
index 557080a..1b2eb83 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll
@@ -75,15 +75,8 @@ define bfloat @v_mad_mixlo_bf16_bf16lo_bf16lo_f32_clamp_post_cvt(bfloat %src0, b
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX1250-NEXT: v_max_num_f32_e32 v0, 0, v0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
-; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_min_num_f32_e32 v0, 1.0, v0
-; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%src0.ext = fpext bfloat %src0 to float
%src1.ext = fpext bfloat %src1 to float
@@ -199,9 +192,8 @@ define <2 x bfloat> @v_mad_mix_v2f32_clamp_postcvt(<2 x bfloat> %src0, <2 x bflo
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1]
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, 0
-; GFX1250-NEXT: v_pk_min_num_bf16 v0, v0, 1.0 op_sel_hi:[1,0]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%src0.ext = fpext <2 x bfloat> %src0 to <2 x float>
%src1.ext = fpext <2 x bfloat> %src1 to <2 x float>
@@ -219,16 +211,13 @@ define <3 x bfloat> @v_mad_mix_v3f32_clamp_postcvt(<3 x bfloat> %src0, <3 x bflo
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_fma_mixlo_bf16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_fma_mixhi_bf16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v1, v3, v5 op_sel_hi:[1,1,1]
-; GFX1250-NEXT: v_pk_max_num_bf16 v1, v6, 0
+; GFX1250-NEXT: v_fma_mixlo_bf16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GFX1250-NEXT: v_fma_mixlo_bf16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_pk_max_num_bf16 v2, v0, 0
-; GFX1250-NEXT: v_pk_min_num_bf16 v0, v1, 1.0 op_sel_hi:[1,0]
+; GFX1250-NEXT: v_fma_mixhi_bf16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, v1 clamp
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1250-NEXT: v_pk_min_num_bf16 v1, v2, 1.0
+; GFX1250-NEXT: v_mov_b32_e32 v0, v6
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%src0.ext = fpext <3 x bfloat> %src0 to <3 x float>
%src1.ext = fpext <3 x bfloat> %src1 to <3 x float>
@@ -261,11 +250,8 @@ define <4 x bfloat> @v_mad_mix_v4f32_clamp_postcvt(<4 x bfloat> %src0, <4 x bflo
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v2, v3
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, 0
-; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, 0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_pk_min_num_bf16 v0, v0, 1.0 op_sel_hi:[1,0]
-; GFX1250-NEXT: v_pk_min_num_bf16 v1, v1, 1.0 op_sel_hi:[1,0]
+; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
+; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, v1 clamp
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%src0.ext = fpext <4 x bfloat> %src0 to <4 x float>
%src1.ext = fpext <4 x bfloat> %src1 to <4 x float>
@@ -291,15 +277,7 @@ define <2 x bfloat> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x bfloat> %src0, <2 x b
; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1]
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v0
-; GFX1250-NEXT: v_max_num_f32_e32 v1, 0, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
-; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_min_num_f32_e32 v1, 1.0, v1
-; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_max_num_bf16 v1, v0, v0 clamp
; GFX1250-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%src0.ext = fpext <2 x bfloat> %src0 to <2 x float>
@@ -328,14 +306,8 @@ define <2 x bfloat> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x bfloat> %src0, <2 x b
; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1]
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
-; GFX1250-NEXT: v_max_num_f32_e32 v1, 0, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
-; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_min_num_f32_e32 v1, 1.0, v1
-; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
+; GFX1250-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, v1 clamp
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
index 9cc42ac..be02045 100644
--- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
+++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
@@ -9851,8 +9851,8 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5
; CHECK-NEXT: s_andn2_saveexec_b32 s6, s6
; CHECK-NEXT: s_cbranch_execz .LBB8_6
; CHECK-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader
-; CHECK-NEXT: v_add_nc_u32_e32 v0, 0x700, v0
; CHECK-NEXT: v_add_nc_u32_e32 v1, 0x700, v1
+; CHECK-NEXT: v_add_nc_u32_e32 v0, 0x700, v0
; CHECK-NEXT: s_movk_i32 s4, 0xf800
; CHECK-NEXT: s_mov_b32 s5, -1
; CHECK-NEXT: .LBB8_5: ; %memmove_bwd_loop
@@ -11167,8 +11167,8 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: s_andn2_saveexec_b32 s6, s6
; ALIGNED-NEXT: s_cbranch_execz .LBB8_6
; ALIGNED-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader
-; ALIGNED-NEXT: v_add_nc_u32_e32 v0, 0x700, v0
; ALIGNED-NEXT: v_add_nc_u32_e32 v1, 0x700, v1
+; ALIGNED-NEXT: v_add_nc_u32_e32 v0, 0x700, v0
; ALIGNED-NEXT: s_movk_i32 s4, 0xf800
; ALIGNED-NEXT: s_mov_b32 s5, -1
; ALIGNED-NEXT: .LBB8_5: ; %memmove_bwd_loop
@@ -12381,8 +12381,8 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5
; UNROLL3-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:2024
; UNROLL3-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:2020
; UNROLL3-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:2016
-; UNROLL3-NEXT: v_add_nc_u32_e32 v2, 0x7b0, v0
; UNROLL3-NEXT: v_add_nc_u32_e32 v1, 0x7b0, v1
+; UNROLL3-NEXT: v_add_nc_u32_e32 v2, 0x7b0, v0
; UNROLL3-NEXT: s_waitcnt vmcnt(3)
; UNROLL3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:2028
; UNROLL3-NEXT: s_waitcnt vmcnt(2)
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
index bc25084..5e5e3bf 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
@@ -415,11 +415,6 @@ define amdgpu_kernel void @local_volatile_store_0(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
-; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
-; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: ds_store_b32 v0, v1
; GFX12-WGP-NEXT: s_endpgm
;
@@ -432,11 +427,6 @@ define amdgpu_kernel void @local_volatile_store_0(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
-; GFX12-CU-NEXT: s_wait_samplecnt 0x0
-; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
-; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(3) %out) {
@@ -562,11 +552,6 @@ define amdgpu_kernel void @local_volatile_store_1(
; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
-; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
-; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: ds_store_b32 v0, v1
; GFX12-WGP-NEXT: s_endpgm
;
@@ -583,11 +568,6 @@ define amdgpu_kernel void @local_volatile_store_1(
; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
-; GFX12-CU-NEXT: s_wait_samplecnt 0x0
-; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
-; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(3) %out) {
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index 9c38d7f..5b0d2d2 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -3565,13 +3565,13 @@ define amdgpu_kernel void @fneg_v2f32_scalar(ptr addrspace(1) %a, <2 x float> %x
; PACKED-SDAG-LABEL: fneg_v2f32_scalar:
; PACKED-SDAG: ; %bb.0:
; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; PACKED-SDAG-NEXT: v_mov_b32_e32 v0, 0
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: s_xor_b32 s3, s3, 0x80000000
; PACKED-SDAG-NEXT: s_xor_b32 s2, s2, 0x80000000
-; PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; PACKED-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; PACKED-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; PACKED-SDAG-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
; PACKED-SDAG-NEXT: s_endpgm
;
; PACKED-GISEL-LABEL: fneg_v2f32_scalar:
diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
index a5c8f04..f54a383 100644
--- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --stop-after=greedy,1 < %s | FileCheck -check-prefix=REGALLOC-GFX908 %s
+;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --stop-after=greedy,2 < %s | FileCheck -check-prefix=REGALLOC-GFX908 %s
;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --stop-after=prologepilog < %s | FileCheck -check-prefix=PEI-GFX908 %s
-;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --stop-after=greedy,1 < %s | FileCheck -check-prefix=REGALLOC-GFX90A %s
+;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --stop-after=greedy,2 < %s | FileCheck -check-prefix=REGALLOC-GFX90A %s
;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --stop-after=prologepilog < %s | FileCheck -check-prefix=PEI-GFX90A %s
; Partial reg copy and spill missed during regalloc handled later at frame lowering.
@@ -12,17 +12,21 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; REGALLOC-GFX908-NEXT: liveins: $sgpr4_sgpr5
; REGALLOC-GFX908-NEXT: {{ $}}
; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %6:agpr_32
- ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6094858 /* regdef:VReg_128 */, def %7
- ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def %8
- ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64, %7, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
+ ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6094858 /* regdef:VReg_128 */, def %25
+ ; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:av_128 = COPY %25
+ ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def %27
+ ; REGALLOC-GFX908-NEXT: SI_SPILL_AV64_SAVE %27, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
+ ; REGALLOC-GFX908-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[COPY]]
+ ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
; REGALLOC-GFX908-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
- ; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
+ ; REGALLOC-GFX908-NEXT: [[COPY2:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
; REGALLOC-GFX908-NEXT: [[AV_MOV_:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 1, implicit $exec
; REGALLOC-GFX908-NEXT: [[AV_MOV_1:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 2, implicit $exec
- ; REGALLOC-GFX908-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[AV_MOV_]], [[AV_MOV_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
- ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef %17:vreg_64, %8, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1)
- ; REGALLOC-GFX908-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]]
- ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %19:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
+ ; REGALLOC-GFX908-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[AV_MOV_]], [[AV_MOV_1]], [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
+ ; REGALLOC-GFX908-NEXT: [[SI_SPILL_AV64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_AV64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
+ ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef %17:vreg_64, [[SI_SPILL_AV64_RESTORE]], 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1)
+ ; REGALLOC-GFX908-NEXT: [[COPY3:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]]
+ ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %19:vreg_64, [[COPY3]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
; REGALLOC-GFX908-NEXT: S_ENDPGM 0
;
; PEI-GFX908-LABEL: name: partial_copy
@@ -57,40 +61,35 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; REGALLOC-GFX90A-NEXT: liveins: $sgpr4_sgpr5
; REGALLOC-GFX90A-NEXT: {{ $}}
; REGALLOC-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %6:agpr_32
- ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:VReg_128_Align2 */, def %7
- ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def %8
- ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64_align2, %7, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
+ ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:VReg_128_Align2 */, def %24
+ ; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:av_128_align2 = COPY %24
+ ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def %22
+ ; REGALLOC-GFX90A-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY %22
+ ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64_align2, [[COPY]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
; REGALLOC-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
- ; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
+ ; REGALLOC-GFX90A-NEXT: [[COPY2:%[0-9]+]]:areg_128_align2 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
; REGALLOC-GFX90A-NEXT: [[AV_MOV_:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 1, implicit $exec
; REGALLOC-GFX90A-NEXT: [[AV_MOV_1:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 2, implicit $exec
- ; REGALLOC-GFX90A-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_I32_4X4X4I8_e64 [[AV_MOV_]], [[AV_MOV_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
- ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef %17:vreg_64_align2, %8, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1)
+ ; REGALLOC-GFX90A-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_I32_4X4X4I8_e64 [[AV_MOV_]], [[AV_MOV_1]], [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
+ ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef %17:vreg_64_align2, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1)
; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %19:vreg_64_align2, [[V_MFMA_I32_4X4X4I8_e64_]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
; REGALLOC-GFX90A-NEXT: S_ENDPGM 0
;
; PEI-GFX90A-LABEL: name: partial_copy
; PEI-GFX90A: bb.0 (%ir-block.0):
- ; PEI-GFX90A-NEXT: liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9
+ ; PEI-GFX90A-NEXT: liveins: $sgpr4_sgpr5
; PEI-GFX90A-NEXT: {{ $}}
- ; PEI-GFX90A-NEXT: $sgpr12_sgpr13_sgpr14_sgpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
- ; PEI-GFX90A-NEXT: $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
- ; PEI-GFX90A-NEXT: $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
; PEI-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef renamable $agpr0
; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
- ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def renamable $vgpr0_vgpr1
- ; PEI-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
- ; PEI-GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1
+ ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def renamable $vgpr2_vgpr3
; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
; PEI-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec
; PEI-GFX90A-NEXT: renamable $vgpr0 = AV_MOV_B32_IMM_PSEUDO 1, implicit $exec
; PEI-GFX90A-NEXT: renamable $vgpr1 = AV_MOV_B32_IMM_PSEUDO 2, implicit $exec
; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
- ; PEI-GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
- ; PEI-GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1
- ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1)
+ ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr2_vgpr3, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1)
; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
; PEI-GFX90A-NEXT: S_ENDPGM 0
call void asm sideeffect "; use $0", "a" (i32 poison)
diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
index d48bfe0..68ef30a9 100644
--- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
@@ -53,31 +53,31 @@ define amdgpu_kernel void @store_v16i32(ptr addrspace(1) %out, <16 x i32> %a) {
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, s20
-; GFX942-NEXT: v_mov_b32_e32 v1, s21
-; GFX942-NEXT: v_mov_b32_e32 v2, s22
-; GFX942-NEXT: v_mov_b32_e32 v3, s23
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
+; GFX942-NEXT: v_mov_b32_e32 v2, s20
+; GFX942-NEXT: v_mov_b32_e32 v3, s21
+; GFX942-NEXT: v_mov_b32_e32 v4, s22
+; GFX942-NEXT: v_mov_b32_e32 v5, s23
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mov_b32_e32 v0, s16
-; GFX942-NEXT: v_mov_b32_e32 v1, s17
-; GFX942-NEXT: v_mov_b32_e32 v2, s18
-; GFX942-NEXT: v_mov_b32_e32 v3, s19
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
+; GFX942-NEXT: v_mov_b32_e32 v2, s16
+; GFX942-NEXT: v_mov_b32_e32 v3, s17
+; GFX942-NEXT: v_mov_b32_e32 v4, s18
+; GFX942-NEXT: v_mov_b32_e32 v5, s19
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mov_b32_e32 v0, s12
-; GFX942-NEXT: v_mov_b32_e32 v1, s13
-; GFX942-NEXT: v_mov_b32_e32 v2, s14
-; GFX942-NEXT: v_mov_b32_e32 v3, s15
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX942-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-NEXT: v_mov_b32_e32 v3, s13
+; GFX942-NEXT: v_mov_b32_e32 v4, s14
+; GFX942-NEXT: v_mov_b32_e32 v5, s15
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mov_b32_e32 v0, s8
-; GFX942-NEXT: v_mov_b32_e32 v1, s9
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: v_mov_b32_e32 v3, s11
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, s8
+; GFX942-NEXT: v_mov_b32_e32 v3, s9
+; GFX942-NEXT: v_mov_b32_e32 v4, s10
+; GFX942-NEXT: v_mov_b32_e32 v5, s11
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
; GFX942-NEXT: s_endpgm
entry:
store <16 x i32> %a, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/read-write-register-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/read-write-register-illegal-type.ll
new file mode 100644
index 0000000..2324f3f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/read-write-register-illegal-type.ll
@@ -0,0 +1,29 @@
+; RUN: not llc -mtriple=amdgcn -mcpu=gfx900 -filetype=null %s 2>&1 | FileCheck %s
+
+; CHECK: error: <unknown>:0:0: cannot use llvm.read_register with illegal type
+define amdgpu_kernel void @test_read_register_i9(ptr addrspace(1) %out) nounwind {
+ %reg = call i9 @llvm.read_register.i9(metadata !0)
+ store i9 %reg, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: error: <unknown>:0:0: cannot use llvm.write_register with illegal type
+define amdgpu_kernel void @test_write_register_i9(ptr addrspace(1) %out) nounwind {
+ call void @llvm.write_register.i9(metadata !0, i9 42)
+ ret void
+}
+
+; CHECK: error: <unknown>:0:0: cannot use llvm.read_register with illegal type
+define amdgpu_kernel void @test_read_register_i128(ptr addrspace(1) %out) nounwind {
+ %reg = call i128 @llvm.read_register.i128(metadata !0)
+ store i128 %reg, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: error: <unknown>:0:0: cannot use llvm.write_register with illegal type
+define amdgpu_kernel void @test_write_register_i128(ptr addrspace(1) %out) nounwind {
+ call void @llvm.write_register.i128(metadata !0, i128 42)
+ ret void
+}
+
+!0 = !{!"m0"}
diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll b/llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll
index f2fd3a8..c035e9f 100644
--- a/llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll
+++ b/llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll
@@ -9,9 +9,9 @@
%asm.output = type { <16 x i32>, <8 x i32>, <5 x i32>, <4 x i32>, <16 x i32> }
; CHECK-LABEL: {{^}}illegal_eviction_assert:
-; CHECK: ; def v[4:19] v[20:27] v[0:4] v[0:3] a[0:15]
+; CHECK: ; def v[13:28] v[0:7] v[8:12] v[0:3] a[0:15]
; CHECK: ; clobber
-; CHECK: ; use v[4:19] v[20:27] v[0:4] v[0:3] a[1:16]
+; CHECK: ; use v[13:28] v[0:7] v[8:12] v[0:3] a[1:16]
define void @illegal_eviction_assert(ptr addrspace(1) %arg) #0 {
;%agpr0 = call i32 asm sideeffect "; def $0","=${a0}"()
%asm = call %asm.output asm sideeffect "; def $0 $1 $2 $3 $4","=v,=v,=v,=v,={a[0:15]}"()
diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
index e8e122e..bbb9df9 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
@@ -59,19 +59,19 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
; GFX90A-LABEL: scalar_to_vector_v8i16:
; GFX90A: ; %bb.0: ; %entry
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v4
+; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX90A-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX90A-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, s3
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v0, s0
-; GFX90A-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-NEXT: v_mov_b32_e32 v1, s3
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
; GFX90A-NEXT: v_mov_b32_e32 v2, s0
-; GFX90A-NEXT: v_mov_b32_e32 v3, s0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX90A-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX90A-NEXT: v_mov_b32_e32 v3, s1
+; GFX90A-NEXT: v_mov_b32_e32 v4, s0
+; GFX90A-NEXT: v_mov_b32_e32 v5, s0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GFX90A-NEXT: s_endpgm
entry:
%val.1.i32 = extractelement <2 x i32> %in, i64 0
@@ -146,19 +146,19 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0
; GFX90A-LABEL: scalar_to_vector_v8f16:
; GFX90A: ; %bb.0: ; %entry
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v4
+; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX90A-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX90A-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, s3
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v0, s0
-; GFX90A-NEXT: v_mov_b32_e32 v1, s1
-; GFX90A-NEXT: v_mov_b32_e32 v3, s0
+; GFX90A-NEXT: v_mov_b32_e32 v1, s3
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
; GFX90A-NEXT: v_mov_b32_e32 v2, s0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX90A-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX90A-NEXT: v_mov_b32_e32 v3, s1
+; GFX90A-NEXT: v_mov_b32_e32 v5, s0
+; GFX90A-NEXT: v_mov_b32_e32 v4, s0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GFX90A-NEXT: s_endpgm
entry:
%val.1.float = extractelement <2 x float> %in, i64 0
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll b/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll
index 9361187..4d864ad 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll
@@ -25,27 +25,27 @@ define void @shufflevector_v2i32_10_physreg_even_vgpr_pair_copy(ptr addrspace(1)
; GFX90A-LABEL: shufflevector_v2i32_10_physreg_even_vgpr_pair_copy:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v4, v5
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shufflevector_v2i32_10_physreg_even_vgpr_pair_copy:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v4, v5
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
-; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%asm = call { i32, i32 } asm "; def $0, $1", "={v4},={v5}"()
@@ -214,27 +214,27 @@ define void @shufflevector_v2i32_11_physreg_even_vgpr_pair_copy(ptr addrspace(1)
; GFX90A-LABEL: shufflevector_v2i32_11_physreg_even_vgpr_pair_copy:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v4, v5
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v5
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shufflevector_v2i32_11_physreg_even_vgpr_pair_copy:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v4, v5
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v5
+; GFX940-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%asm = call { i32, i32 } asm "; def $0, $1", "={v4},={v5}"()
@@ -265,31 +265,31 @@ define void @shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy(ptr addrspace(
; GFX90A-LABEL: shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v4, v5, v6, v7
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v2, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v10, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v6
+; GFX90A-NEXT: v_mov_b32_e32 v8, v7
+; GFX90A-NEXT: v_mov_b32_e32 v11, v4
+; GFX90A-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v8, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v4, v5, v6, v7
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
-; GFX940-NEXT: v_mov_b32_e32 v3, v4
-; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v10, v5
+; GFX940-NEXT: v_mov_b32_e32 v9, v6
+; GFX940-NEXT: v_mov_b32_e32 v8, v7
+; GFX940-NEXT: v_mov_b32_e32 v11, v4
+; GFX940-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%asm = call { i32, i32, i32, i32 } asm "; def $0, $1, $2, $3", "={v4},={v5},={v6},={v7}"()
@@ -327,31 +327,31 @@ define void @shufflevector_v4i32_1032_physreg_even_vgpr_quad_copy(ptr addrspace(
; GFX90A-LABEL: shufflevector_v4i32_1032_physreg_even_vgpr_quad_copy:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v4, v5, v6, v7
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v3, v6
-; GFX90A-NEXT: v_mov_b32_e32 v2, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v11, v6
+; GFX90A-NEXT: v_mov_b32_e32 v10, v7
+; GFX90A-NEXT: v_mov_b32_e32 v9, v4
+; GFX90A-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shufflevector_v4i32_1032_physreg_even_vgpr_quad_copy:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v8, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v4, v5, v6, v7
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v3, v6
-; GFX940-NEXT: v_mov_b32_e32 v2, v7
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
-; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v8, v5
+; GFX940-NEXT: v_mov_b32_e32 v11, v6
+; GFX940-NEXT: v_mov_b32_e32 v10, v7
+; GFX940-NEXT: v_mov_b32_e32 v9, v4
+; GFX940-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%asm = call { i32, i32, i32, i32 } asm "; def $0, $1, $2, $3", "={v4},={v5},={v6},={v7}"()
@@ -746,16 +746,15 @@ define i32 @shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy_other_use_elt(p
; GFX90A-LABEL: shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy_other_use_elt:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v4, v5, v6, v7
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v2, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
-; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mov_b32_e32 v10, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v6
+; GFX90A-NEXT: v_mov_b32_e32 v8, v7
+; GFX90A-NEXT: v_mov_b32_e32 v11, v4
+; GFX90A-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17]
; GFX90A-NEXT: v_mov_b32_e32 v0, v6
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -763,17 +762,16 @@ define i32 @shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy_other_use_elt(p
; GFX940-LABEL: shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy_other_use_elt:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v8, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v4, v5, v6, v7
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
-; GFX940-NEXT: v_mov_b32_e32 v3, v4
-; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
-; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_mov_b32_e32 v10, v5
+; GFX940-NEXT: v_mov_b32_e32 v9, v6
+; GFX940-NEXT: v_mov_b32_e32 v8, v7
+; GFX940-NEXT: v_mov_b32_e32 v11, v4
+; GFX940-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
; GFX940-NEXT: v_mov_b32_e32 v0, v6
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
index 44c88f8..eb0d546 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
@@ -127,34 +127,26 @@ define amdgpu_kernel void @max_10_vgprs_used_9a() #1 {
; GFX908: ; %bb.0:
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_accvgpr_read_b32 v7, a3
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a2
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a1
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a0
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a1
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a2
-; GFX908-NEXT: v_accvgpr_write_b32 a1, v3
-; GFX908-NEXT: v_accvgpr_read_b32 v8, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
-; GFX908-NEXT: v_accvgpr_write_b32 a8, v7
-; GFX908-NEXT: v_accvgpr_write_b32 a2, v8
-; GFX908-NEXT: v_accvgpr_write_b32 a3, v2
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a1
; GFX908-NEXT: v_accvgpr_write_b32 a4, v3
-; GFX908-NEXT: v_accvgpr_write_b32 a7, v6
-; GFX908-NEXT: v_accvgpr_write_b32 a6, v5
-; GFX908-NEXT: v_accvgpr_write_b32 a5, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v1
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v5
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: s_endpgm
@@ -173,16 +165,16 @@ define amdgpu_kernel void @max_10_vgprs_used_9a() #1 {
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v9, a3
-; GFX90A-NEXT: v_accvgpr_read_b32 v8, a2
+; GFX90A-NEXT: v_accvgpr_read_b32 v5, a3
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a2
; GFX90A-NEXT: v_accvgpr_write_b32 a5, v3
; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2
; GFX90A-NEXT: v_accvgpr_write_b32 a3, v1
; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v8
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v9
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v5
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/ssubo.ll b/llvm/test/CodeGen/AMDGPU/ssubo.ll
index 053038d..382d892 100644
--- a/llvm/test/CodeGen/AMDGPU/ssubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/ssubo.ll
@@ -1,14 +1,116 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
-
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti | FileCheck %s --check-prefix=SI
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=VI
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s --check-prefix=GFX9
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s --check-prefix=GFX10
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s --check-prefix=GFX11
declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone
declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone
declare { <2 x i32>, <2 x i1> } @llvm.ssub.with.overflow.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-; FUNC-LABEL: {{^}}ssubo_i64_zext:
define amdgpu_kernel void @ssubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind {
+; SI-LABEL: ssubo_i64_zext:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: s_sub_u32 s10, s2, s8
+; SI-NEXT: s_subb_u32 s11, s3, s9
+; SI-NEXT: v_mov_b32_e32 v1, s3
+; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1]
+; SI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[8:9], 0
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-NEXT: v_mov_b32_e32 v1, s11
+; SI-NEXT: v_add_i32_e32 v0, vcc, s10, v0
+; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: ssubo_i64_zext:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: s_sub_u32 s6, s2, s4
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: s_subb_u32 s7, s3, s5
+; VI-NEXT: v_cmp_gt_i64_e64 s[8:9], s[4:5], 0
+; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[1:2]
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_xor_b64 s[0:1], s[8:9], vcc
+; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: ssubo_i64_zext:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: s_sub_u32 s4, s2, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: s_subb_u32 s5, s3, s7
+; GFX9-NEXT: v_cmp_gt_i64_e64 s[8:9], s[6:7], 0
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: s_xor_b64 s[2:3], s[8:9], vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: ssubo_i64_zext:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_sub_u32 s4, s2, s6
+; GFX10-NEXT: s_subb_u32 s5, s3, s7
+; GFX10-NEXT: v_cmp_gt_i64_e64 s6, s[6:7], 0
+; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[4:5], s[2:3]
+; GFX10-NEXT: s_xor_b32 s2, s6, s2
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX10-NEXT: v_add_co_u32 v0, s2, s4, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s5, 0, s2
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: ssubo_i64_zext:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_sub_u32 s6, s2, s4
+; GFX11-NEXT: s_subb_u32 s7, s3, s5
+; GFX11-NEXT: v_cmp_gt_i64_e64 s4, s[4:5], 0
+; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[6:7], s[2:3]
+; GFX11-NEXT: s_xor_b32 s2, s4, s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX11-NEXT: v_add_co_u32 v0, s2, s6, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s2
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind
%val = extractvalue { i64, i1 } %ssub, 0
%carry = extractvalue { i64, i1 } %ssub, 1
@@ -18,8 +120,102 @@ define amdgpu_kernel void @ssubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
ret void
}
-; FUNC-LABEL: {{^}}s_ssubo_i32:
define amdgpu_kernel void @s_ssubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) nounwind {
+; SI-LABEL: s_ssubo_i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_sub_i32 s12, s8, s9
+; SI-NEXT: s_cmp_gt_i32 s9, 0
+; SI-NEXT: s_cselect_b64 s[10:11], -1, 0
+; SI-NEXT: s_cmp_lt_i32 s12, s8
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_cselect_b64 s[8:9], -1, 0
+; SI-NEXT: v_mov_b32_e32 v0, s12
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_xor_b64 s[4:5], s[10:11], s[8:9]
+; SI-NEXT: s_mov_b32 s0, s2
+; SI-NEXT: s_mov_b32 s1, s3
+; SI-NEXT: s_mov_b32 s2, s6
+; SI-NEXT: s_mov_b32 s3, s7
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_ssubo_i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_sub_i32 s6, s4, s5
+; VI-NEXT: s_cmp_gt_i32 s5, 0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: s_cmp_lt_i32 s6, s4
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
+; VI-NEXT: v_mov_b32_e32 v4, s6
+; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
+; VI-NEXT: flat_store_dword v[0:1], v4
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: flat_store_byte v[2:3], v0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: s_ssubo_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: s_sub_i32 s4, s6, s7
+; GFX9-NEXT: v_sub_i32 v1, s6, v1 clamp
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_ssubo_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_sub_nc_i32 v0, s6, s7 clamp
+; GFX10-NEXT: s_sub_i32 s4, s6, s7
+; GFX10-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s4, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: global_store_dword v1, v2, s[0:1]
+; GFX10-NEXT: global_store_byte v1, v0, s[2:3]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_ssubo_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_sub_nc_i32 v0, s6, s7 clamp
+; GFX11-NEXT: s_sub_i32 s4, s6, s7
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s4
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, s4, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v1, v2, s[0:1]
+; GFX11-NEXT: global_store_b8 v1, v0, s[2:3]
+; GFX11-NEXT: s_endpgm
%ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind
%val = extractvalue { i32, i1 } %ssub, 0
%carry = extractvalue { i32, i1 } %ssub, 1
@@ -28,8 +224,112 @@ define amdgpu_kernel void @s_ssubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
ret void
}
-; FUNC-LABEL: {{^}}v_ssubo_i32:
define amdgpu_kernel void @v_ssubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
+; SI-LABEL: v_ssubo_i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s4
+; SI-NEXT: s_mov_b32 s13, s5
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
+; SI-NEXT: s_mov_b32 s6, s10
+; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
+; SI-NEXT: s_mov_b32 s8, s0
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: s_mov_b32 s4, s2
+; SI-NEXT: s_mov_b32 s5, s3
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_sub_i32_e32 v2, vcc, v0, v1
+; SI-NEXT: v_cmp_lt_i32_e32 vcc, 0, v1
+; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], v2, v0
+; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0
+; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: v_ssubo_i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: flat_load_dword v5, v[2:3]
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_sub_u32_e32 v6, vcc, v4, v5
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5
+; VI-NEXT: v_cmp_lt_i32_e64 s[0:1], v6, v4
+; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT: flat_store_dword v[0:1], v6
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: flat_store_byte v[2:3], v0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_ssubo_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[12:13]
+; GFX9-NEXT: global_load_dword v2, v0, s[14:15]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_sub_i32 v3, v1, v2 clamp
+; GFX9-NEXT: v_sub_u32_e32 v1, v1, v2
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v1, v3
+; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: global_store_byte v0, v1, s[10:11]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_ssubo_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dword v1, v0, s[12:13]
+; GFX10-NEXT: global_load_dword v2, v0, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_sub_nc_i32 v3, v1, v2 clamp
+; GFX10-NEXT: v_sub_nc_u32_e32 v1, v1, v2
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v3
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX10-NEXT: global_store_byte v0, v2, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_ssubo_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[4:5]
+; GFX11-NEXT: global_load_b32 v2, v0, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_sub_nc_i32 v3, v1, v2 clamp
+; GFX11-NEXT: v_sub_nc_u32_e32 v1, v1, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v3
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT: s_endpgm
%a = load i32, ptr addrspace(1) %aptr, align 4
%b = load i32, ptr addrspace(1) %bptr, align 4
%ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind
@@ -40,10 +340,109 @@ define amdgpu_kernel void @v_ssubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
ret void
}
-; FUNC-LABEL: {{^}}s_ssubo_i64:
-; GCN: s_sub_u32
-; GCN: s_subb_u32
define amdgpu_kernel void @s_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) nounwind {
+; SI-LABEL: s_ssubo_i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_sub_u32 s12, s4, s6
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: s_subb_u32 s13, s5, s7
+; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1]
+; SI-NEXT: v_cmp_gt_i64_e64 s[4:5], s[6:7], 0
+; SI-NEXT: v_mov_b32_e32 v0, s12
+; SI-NEXT: s_mov_b32 s8, s0
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: v_mov_b32_e32 v1, s13
+; SI-NEXT: s_xor_b64 s[4:5], s[4:5], vcc
+; SI-NEXT: s_mov_b32 s0, s2
+; SI-NEXT: s_mov_b32 s1, s3
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_ssubo_i64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_sub_u32 s0, s4, s6
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_subb_u32 s1, s5, s7
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5]
+; VI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: flat_store_byte v[2:3], v0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: s_ssubo_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_sub_u32 s0, s12, s14
+; GFX9-NEXT: v_mov_b32_e32 v0, s12
+; GFX9-NEXT: v_mov_b32_e32 v1, s13
+; GFX9-NEXT: s_subb_u32 s1, s13, s15
+; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[14:15], 0
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9-NEXT: global_store_byte v2, v0, s[10:11]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_ssubo_i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_sub_u32 s0, s12, s14
+; GFX10-NEXT: s_subb_u32 s1, s13, s15
+; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[14:15], 0
+; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], s[12:13]
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: s_xor_b32 s0, s2, s3
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX10-NEXT: global_store_byte v2, v3, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_ssubo_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_sub_u32 s8, s4, s6
+; GFX11-NEXT: s_subb_u32 s9, s5, s7
+; GFX11-NEXT: v_cmp_gt_i64_e64 s6, s[6:7], 0
+; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[4:5]
+; GFX11-NEXT: v_mov_b32_e32 v0, s8
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX11-NEXT: s_xor_b32 s4, s6, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
+; GFX11-NEXT: s_endpgm
%ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind
%val = extractvalue { i64, i1 } %ssub, 0
%carry = extractvalue { i64, i1 } %ssub, 1
@@ -52,16 +451,121 @@ define amdgpu_kernel void @s_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
ret void
}
-; FUNC-LABEL: {{^}}v_ssubo_i64:
-; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc,
-; SI: v_subb_u32_e32 v{{[0-9]+}}, vcc,
-
-; VI: v_sub_u32_e32 v{{[0-9]+}}, vcc,
-; VI: v_subb_u32_e32 v{{[0-9]+}}, vcc,
-
-; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc,
-; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc,
define amdgpu_kernel void @v_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
+; SI-LABEL: v_ssubo_i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s4
+; SI-NEXT: s_mov_b32 s13, s5
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
+; SI-NEXT: s_mov_b32 s6, s10
+; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
+; SI-NEXT: s_mov_b32 s8, s0
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: s_mov_b32 s4, s2
+; SI-NEXT: s_mov_b32 s5, s3
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
+; SI-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc
+; SI-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3]
+; SI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1]
+; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[8:11], 0
+; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: v_ssubo_i64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v6, s2
+; VI-NEXT: v_mov_b32_e32 v7, s3
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_sub_u32_e32 v8, vcc, v0, v2
+; VI-NEXT: v_subb_u32_e32 v9, vcc, v1, v3, vcc
+; VI-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3]
+; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[0:1]
+; VI-NEXT: flat_store_dwordx2 v[4:5], v[8:9]
+; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: flat_store_byte v[6:7], v0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_ssubo_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[12:13]
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[14:15]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3]
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1]
+; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[8:9]
+; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9-NEXT: global_store_byte v6, v0, s[10:11]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_ssubo_i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v6, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[12:13]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2
+; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3]
+; GFX10-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
+; GFX10-NEXT: s_xor_b32 s0, vcc_lo, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX10-NEXT: global_store_dwordx2 v6, v[4:5], s[8:9]
+; GFX10-NEXT: global_store_byte v6, v0, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_ssubo_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[4:11], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v6, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b64 v[0:1], v6, s[8:9]
+; GFX11-NEXT: global_load_b64 v[2:3], v6, s[10:11]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
+; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3]
+; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
+; GFX11-NEXT: s_xor_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b64 v6, v[4:5], s[4:5]
+; GFX11-NEXT: global_store_b8 v6, v0, s[6:7]
+; GFX11-NEXT: s_endpgm
%a = load i64, ptr addrspace(1) %aptr, align 4
%b = load i64, ptr addrspace(1) %bptr, align 4
%ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind
@@ -72,14 +576,134 @@ define amdgpu_kernel void @v_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
ret void
}
-; FUNC-LABEL: {{^}}v_ssubo_v2i32:
-; SICIVI: v_cmp_lt_i32
-; SICIVI: v_cmp_lt_i32
-; SICIVI: v_sub_{{[iu]}}32
-; SICIVI: v_cmp_lt_i32
-; SICIVI: v_cmp_lt_i32
-; SICIVI: v_sub_{{[iu]}}32
define amdgpu_kernel void @v_ssubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
+; SI-LABEL: v_ssubo_v2i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s4
+; SI-NEXT: s_mov_b32 s13, s5
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
+; SI-NEXT: s_mov_b32 s6, s10
+; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
+; SI-NEXT: s_mov_b32 s8, s0
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: s_mov_b32 s12, s2
+; SI-NEXT: s_mov_b32 s13, s3
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_sub_i32_e32 v5, vcc, v1, v3
+; SI-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
+; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 0, v3
+; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], v5, v1
+; SI-NEXT: v_cmp_lt_i32_e32 vcc, 0, v2
+; SI-NEXT: v_cmp_lt_i32_e64 s[2:3], v4, v0
+; SI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
+; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; SI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3]
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[8:11], 0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: v_ssubo_v2i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v6, s2
+; VI-NEXT: v_mov_b32_e32 v7, s3
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_sub_u32_e32 v9, vcc, v1, v3
+; VI-NEXT: v_sub_u32_e32 v8, vcc, v0, v2
+; VI-NEXT: v_cmp_lt_i32_e64 s[0:1], 0, v3
+; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v1
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 0, v2
+; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v0
+; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
+; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; VI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3]
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: flat_store_dwordx2 v[4:5], v[8:9]
+; VI-NEXT: flat_store_dwordx2 v[6:7], v[0:1]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_ssubo_v2i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[12:13]
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[14:15]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_sub_u32_e32 v5, v1, v3
+; GFX9-NEXT: v_sub_i32 v1, v1, v3 clamp
+; GFX9-NEXT: v_sub_u32_e32 v4, v0, v2
+; GFX9-NEXT: v_sub_i32 v0, v0, v2 clamp
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v5, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v4, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[8:9]
+; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[10:11]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_ssubo_v2i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v5, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v5, s[12:13]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v5, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_sub_nc_u32_e32 v4, v1, v3
+; GFX10-NEXT: v_sub_nc_i32 v1, v1, v3 clamp
+; GFX10-NEXT: v_sub_nc_u32_e32 v3, v0, v2
+; GFX10-NEXT: v_sub_nc_i32 v0, v0, v2 clamp
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: global_store_dwordx2 v5, v[3:4], s[8:9]
+; GFX10-NEXT: global_store_dwordx2 v5, v[0:1], s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_ssubo_v2i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v5, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b64 v[0:1], v5, s[4:5]
+; GFX11-NEXT: global_load_b64 v[2:3], v5, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_sub_nc_u32_e32 v4, v1, v3
+; GFX11-NEXT: v_sub_nc_i32 v1, v1, v3 clamp
+; GFX11-NEXT: v_sub_nc_u32_e32 v3, v0, v2
+; GFX11-NEXT: v_sub_nc_i32 v0, v0, v2 clamp
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b64 v5, v[3:4], s[0:1]
+; GFX11-NEXT: global_store_b64 v5, v[0:1], s[2:3]
+; GFX11-NEXT: s_endpgm
%a = load <2 x i32>, ptr addrspace(1) %aptr, align 4
%b = load <2 x i32>, ptr addrspace(1) %bptr, align 4
%sadd = call { <2 x i32>, <2 x i1> } @llvm.ssub.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
diff --git a/llvm/test/CodeGen/AMDGPU/structurize-hoist.ll b/llvm/test/CodeGen/AMDGPU/structurize-hoist.ll
new file mode 100644
index 0000000..42436a1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/structurize-hoist.ll
@@ -0,0 +1,180 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX900 %s
+
+
+%pair = type { i32, i32 }
+
+define void @test_extractvalue_then_else(ptr %ptr, i1 %cond) {
+; GFX900-LABEL: test_extractvalue_then_else:
+; GFX900: ; %bb.0: ; %if
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: flat_load_dword v3, v[0:1]
+; GFX900-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX900-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2
+; GFX900-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX900-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX900-NEXT: s_cbranch_execz .LBB0_2
+; GFX900-NEXT: ; %bb.1: ; %else
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_add_u32_e32 v3, 1, v3
+; GFX900-NEXT: .LBB0_2: ; %Flow
+; GFX900-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX900-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: flat_store_dword v[0:1], v3
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+if:
+ %load_then = load %pair, ptr %ptr
+ br i1 %cond, label %then, label %else
+
+then:
+ %a_then = extractvalue %pair %load_then, 0
+ br label %merge
+
+else:
+ %a_else = extractvalue %pair %load_then, 0
+ %sum_else = add i32 %a_else, 1
+ br label %merge
+
+merge:
+ %phi = phi i32 [ %a_then, %then ], [ %sum_else, %else ]
+ store i32 %phi, ptr %ptr
+ ret void
+}
+
+define void @test_extractvalue_else_then(ptr %ptr, i1 %cond) {
+; GFX900-LABEL: test_extractvalue_else_then:
+; GFX900: ; %bb.0: ; %if
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: flat_load_dword v3, v[0:1]
+; GFX900-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX900-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2
+; GFX900-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX900-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX900-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX900-NEXT: s_cbranch_execz .LBB1_2
+; GFX900-NEXT: ; %bb.1: ; %else
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_add_u32_e32 v3, 1, v3
+; GFX900-NEXT: .LBB1_2: ; %merge
+; GFX900-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: flat_store_dword v[0:1], v3
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+if:
+ %load_then = load %pair, ptr %ptr
+ br i1 %cond, label %else, label %then
+
+else:
+ %a_else = extractvalue %pair %load_then, 0
+ %sum_else = add i32 %a_else, 1
+ br label %merge
+
+then:
+ %a_then = extractvalue %pair %load_then, 0
+ br label %merge
+
+merge:
+ %phi = phi i32 [ %a_then, %then ], [ %sum_else, %else ]
+ store i32 %phi, ptr %ptr
+ ret void
+}
+
+define amdgpu_kernel void @test_loop_with_if( ptr %ptr, i1 %cond) #0 {
+; GFX900-LABEL: test_loop_with_if:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-NEXT: v_mov_b32_e32 v5, 0
+; GFX900-NEXT: s_mov_b64 s[4:5], 0
+; GFX900-NEXT: s_movk_i32 s10, 0xfe
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: s_bitcmp1_b32 s2, 0
+; GFX900-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GFX900-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[2:3]
+; GFX900-NEXT: v_mov_b32_e32 v2, s1
+; GFX900-NEXT: s_xor_b64 s[2:3], s[2:3], -1
+; GFX900-NEXT: v_mov_b32_e32 v1, s0
+; GFX900-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v3
+; GFX900-NEXT: s_branch .LBB2_2
+; GFX900-NEXT: .LBB2_1: ; %latch
+; GFX900-NEXT: ; in Loop: Header=BB2_2 Depth=1
+; GFX900-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_add_u32_e32 v5, 20, v3
+; GFX900-NEXT: v_cmp_lt_i32_e32 vcc, s10, v5
+; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX900-NEXT: flat_store_dword v[1:2], v3
+; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX900-NEXT: s_cbranch_execz .LBB2_8
+; GFX900-NEXT: .LBB2_2: ; %loop
+; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX900-NEXT: flat_load_dwordx2 v[3:4], v[1:2]
+; GFX900-NEXT: s_and_b64 vcc, exec, s[0:1]
+; GFX900-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GFX900-NEXT: s_mov_b64 s[6:7], 0
+; GFX900-NEXT: s_cbranch_vccnz .LBB2_4
+; GFX900-NEXT: ; %bb.3: ; %if
+; GFX900-NEXT: ; in Loop: Header=BB2_2 Depth=1
+; GFX900-NEXT: v_cmp_gt_i32_e32 vcc, 11, v5
+; GFX900-NEXT: s_andn2_b64 s[8:9], s[2:3], exec
+; GFX900-NEXT: s_and_b64 s[12:13], vcc, exec
+; GFX900-NEXT: s_mov_b64 s[6:7], -1
+; GFX900-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
+; GFX900-NEXT: .LBB2_4: ; %Flow
+; GFX900-NEXT: ; in Loop: Header=BB2_2 Depth=1
+; GFX900-NEXT: s_and_saveexec_b64 s[12:13], s[8:9]
+; GFX900-NEXT: s_xor_b64 s[8:9], exec, s[12:13]
+; GFX900-NEXT: s_cbranch_execz .LBB2_6
+; GFX900-NEXT: ; %bb.5: ; %else
+; GFX900-NEXT: ; in Loop: Header=BB2_2 Depth=1
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_add_u32_e32 v3, v3, v4
+; GFX900-NEXT: s_andn2_b64 s[6:7], s[6:7], exec
+; GFX900-NEXT: .LBB2_6: ; %Flow1
+; GFX900-NEXT: ; in Loop: Header=BB2_2 Depth=1
+; GFX900-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX900-NEXT: s_and_saveexec_b64 s[8:9], s[6:7]
+; GFX900-NEXT: s_cbranch_execz .LBB2_1
+; GFX900-NEXT: ; %bb.7: ; %then
+; GFX900-NEXT: ; in Loop: Header=BB2_2 Depth=1
+; GFX900-NEXT: flat_store_dword v[1:2], v0
+; GFX900-NEXT: s_branch .LBB2_1
+; GFX900-NEXT: .LBB2_8: ; %end
+; GFX900-NEXT: s_endpgm
+entry:
+ %a = tail call i32 @llvm.amdgcn.workitem.id.x()
+ br label %loop
+
+loop:
+ %entry_phi = phi i32 [ 0, %entry ], [ %a15, %latch ]
+ %load = load %pair, ptr %ptr
+ br i1 %cond, label %if, label %else
+
+if:
+ %cmp = icmp sgt i32 %entry_phi, 10
+ br i1 %cmp, label %then, label %else
+
+then:
+ %a_then = extractvalue %pair %load, 0
+ store i32 %a, ptr %ptr, align 4
+ br label %latch
+
+else:
+ %a2 = extractvalue %pair %load, 1
+ %y = extractvalue %pair %load, 0
+ %a_else = add i32 %y, %a2
+ br label %latch
+
+latch:
+ %a_test = phi i32 [ %a_then, %then ], [ %a_else, %else ]
+ store i32 %a_test, ptr %ptr
+ %a15 = add nsw i32 %a_test, 20
+ %a16 = icmp slt i32 %a15, 255
+ br i1 %a16, label %loop, label %end
+
+end:
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
index d23e314..f6c357d 100644
--- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -70,12 +70,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GLOBALNESS1-NEXT: s_mov_b64 s[38:39], s[8:9]
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[8:9], 1, v1
-; GLOBALNESS1-NEXT: ; implicit-def: $vgpr59 : SGPR spill to VGPR lane
+; GLOBALNESS1-NEXT: ; implicit-def: $vgpr57 : SGPR spill to VGPR lane
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0
; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s8, 0
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s8, 0
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[68:69], 1, v0
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s9, 1
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s9, 1
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[70:71], 1, v3
; GLOBALNESS1-NEXT: v_mov_b32_e32 v46, 0x80
; GLOBALNESS1-NEXT: s_mov_b32 s82, s16
@@ -84,7 +84,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_mov_b64 s[34:35], s[10:11]
; GLOBALNESS1-NEXT: v_mov_b32_e32 v47, 0
; GLOBALNESS1-NEXT: s_mov_b32 s32, 0
-; GLOBALNESS1-NEXT: ; implicit-def: $vgpr56_vgpr57
+; GLOBALNESS1-NEXT: ; implicit-def: $vgpr58_vgpr59
; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0)
; GLOBALNESS1-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -93,24 +93,24 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v0
; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s4, 2
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s4, 2
; GLOBALNESS1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s5, 3
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s5, 3
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v3
; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s4, 4
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s5, 5
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s4, 4
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s5, 5
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v2
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s4, 6
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s5, 7
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s4, 6
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s5, 7
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[80:81], 1, v1
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s70, 8
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s71, 9
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s70, 8
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s71, 9
; GLOBALNESS1-NEXT: s_branch .LBB1_4
; GLOBALNESS1-NEXT: .LBB1_1: ; %bb70.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT: v_readlane_b32 s6, v59, 6
-; GLOBALNESS1-NEXT: v_readlane_b32 s7, v59, 7
+; GLOBALNESS1-NEXT: v_readlane_b32 s6, v57, 6
+; GLOBALNESS1-NEXT: v_readlane_b32 s7, v57, 7
; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7]
; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_28
; GLOBALNESS1-NEXT: .LBB1_2: ; %Flow15
@@ -120,7 +120,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: .LBB1_3: ; %Flow28
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7]
-; GLOBALNESS1-NEXT: v_pk_mov_b32 v[56:57], v[0:1], v[0:1] op_sel:[0,1]
+; GLOBALNESS1-NEXT: v_pk_mov_b32 v[58:59], v[0:1], v[0:1] op_sel:[0,1]
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_29
; GLOBALNESS1-NEXT: .LBB1_4: ; %bb5
; GLOBALNESS1-NEXT: ; =>This Loop Header: Depth=1
@@ -128,7 +128,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: flat_load_dword v40, v[46:47]
; GLOBALNESS1-NEXT: s_add_u32 s8, s38, 40
; GLOBALNESS1-NEXT: buffer_store_dword v42, off, s[0:3], 0
-; GLOBALNESS1-NEXT: flat_load_dword v58, v[46:47]
+; GLOBALNESS1-NEXT: flat_load_dword v56, v[46:47]
; GLOBALNESS1-NEXT: s_addc_u32 s9, s39, 0
; GLOBALNESS1-NEXT: s_getpc_b64 s[4:5]
; GLOBALNESS1-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4
@@ -186,10 +186,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: ; %bb.11: ; %bb33.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[44:45], off
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s8, 10
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s9, 11
-; GLOBALNESS1-NEXT: v_readlane_b32 s4, v59, 2
-; GLOBALNESS1-NEXT: v_readlane_b32 s5, v59, 3
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s8, 10
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s9, 11
+; GLOBALNESS1-NEXT: v_readlane_b32 s4, v57, 2
+; GLOBALNESS1-NEXT: v_readlane_b32 s5, v57, 3
; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[4:5]
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_13
; GLOBALNESS1-NEXT: ; %bb.12: ; %bb39.i
@@ -198,7 +198,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS1-NEXT: .LBB1_13: ; %bb44.lr.ph.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v58
+; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56
; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc
; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0)
; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1]
@@ -228,8 +228,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_21
; GLOBALNESS1-NEXT: ; %bb.19: ; %bb3.i.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT: v_readlane_b32 s4, v59, 0
-; GLOBALNESS1-NEXT: v_readlane_b32 s5, v59, 1
+; GLOBALNESS1-NEXT: v_readlane_b32 s4, v57, 0
+; GLOBALNESS1-NEXT: v_readlane_b32 s5, v57, 1
; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[4:5]
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_21
; GLOBALNESS1-NEXT: ; %bb.20: ; %bb6.i.i
@@ -265,7 +265,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_mov_b32 s13, s83
; GLOBALNESS1-NEXT: s_mov_b32 s14, s82
; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41
-; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[56:57], off
+; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[58:59], off
; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[54:55]
; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[96:97]
; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_14
@@ -277,13 +277,13 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: .LBB1_24: ; %Flow23
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS1-NEXT: s_load_dwordx4 s[4:7], s[38:39], 0x0
-; GLOBALNESS1-NEXT: v_readlane_b32 s70, v59, 8
-; GLOBALNESS1-NEXT: v_readlane_b32 s8, v59, 10
+; GLOBALNESS1-NEXT: v_readlane_b32 s70, v57, 8
+; GLOBALNESS1-NEXT: v_readlane_b32 s8, v57, 10
; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0
-; GLOBALNESS1-NEXT: v_readlane_b32 s71, v59, 9
+; GLOBALNESS1-NEXT: v_readlane_b32 s71, v57, 9
; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0)
; GLOBALNESS1-NEXT: s_mov_b32 s55, s7
-; GLOBALNESS1-NEXT: v_readlane_b32 s9, v59, 11
+; GLOBALNESS1-NEXT: v_readlane_b32 s9, v57, 11
; GLOBALNESS1-NEXT: .LBB1_25: ; %Flow24
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[52:53]
@@ -291,8 +291,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_2
; GLOBALNESS1-NEXT: ; %bb.26: ; %bb67.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT: v_readlane_b32 s6, v59, 4
-; GLOBALNESS1-NEXT: v_readlane_b32 s7, v59, 5
+; GLOBALNESS1-NEXT: v_readlane_b32 s6, v57, 4
+; GLOBALNESS1-NEXT: v_readlane_b32 s7, v57, 5
; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7]
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_1
; GLOBALNESS1-NEXT: ; %bb.27: ; %bb69.i
@@ -384,12 +384,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GLOBALNESS0-NEXT: s_mov_b64 s[38:39], s[8:9]
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[8:9], 1, v1
-; GLOBALNESS0-NEXT: ; implicit-def: $vgpr59 : SGPR spill to VGPR lane
+; GLOBALNESS0-NEXT: ; implicit-def: $vgpr57 : SGPR spill to VGPR lane
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0
; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s8, 0
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s8, 0
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[68:69], 1, v0
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s9, 1
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s9, 1
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[84:85], 1, v3
; GLOBALNESS0-NEXT: v_mov_b32_e32 v46, 0x80
; GLOBALNESS0-NEXT: s_mov_b32 s70, s16
@@ -398,7 +398,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_mov_b64 s[34:35], s[10:11]
; GLOBALNESS0-NEXT: v_mov_b32_e32 v47, 0
; GLOBALNESS0-NEXT: s_mov_b32 s32, 0
-; GLOBALNESS0-NEXT: ; implicit-def: $vgpr56_vgpr57
+; GLOBALNESS0-NEXT: ; implicit-def: $vgpr58_vgpr59
; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0)
; GLOBALNESS0-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -407,24 +407,24 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v0
; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s4, 2
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s4, 2
; GLOBALNESS0-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s5, 3
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s5, 3
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v3
; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s4, 4
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s5, 5
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s4, 4
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s5, 5
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v2
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s4, 6
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s5, 7
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s4, 6
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s5, 7
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[80:81], 1, v1
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s84, 8
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s85, 9
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s84, 8
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s85, 9
; GLOBALNESS0-NEXT: s_branch .LBB1_4
; GLOBALNESS0-NEXT: .LBB1_1: ; %bb70.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: v_readlane_b32 s6, v59, 6
-; GLOBALNESS0-NEXT: v_readlane_b32 s7, v59, 7
+; GLOBALNESS0-NEXT: v_readlane_b32 s6, v57, 6
+; GLOBALNESS0-NEXT: v_readlane_b32 s7, v57, 7
; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7]
; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_28
; GLOBALNESS0-NEXT: .LBB1_2: ; %Flow15
@@ -434,7 +434,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: .LBB1_3: ; %Flow28
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7]
-; GLOBALNESS0-NEXT: v_pk_mov_b32 v[56:57], v[0:1], v[0:1] op_sel:[0,1]
+; GLOBALNESS0-NEXT: v_pk_mov_b32 v[58:59], v[0:1], v[0:1] op_sel:[0,1]
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_29
; GLOBALNESS0-NEXT: .LBB1_4: ; %bb5
; GLOBALNESS0-NEXT: ; =>This Loop Header: Depth=1
@@ -442,7 +442,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: flat_load_dword v40, v[46:47]
; GLOBALNESS0-NEXT: s_add_u32 s8, s38, 40
; GLOBALNESS0-NEXT: buffer_store_dword v42, off, s[0:3], 0
-; GLOBALNESS0-NEXT: flat_load_dword v58, v[46:47]
+; GLOBALNESS0-NEXT: flat_load_dword v56, v[46:47]
; GLOBALNESS0-NEXT: s_addc_u32 s9, s39, 0
; GLOBALNESS0-NEXT: s_getpc_b64 s[4:5]
; GLOBALNESS0-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4
@@ -500,10 +500,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: ; %bb.11: ; %bb33.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[44:45], off
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s8, 10
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s9, 11
-; GLOBALNESS0-NEXT: v_readlane_b32 s4, v59, 2
-; GLOBALNESS0-NEXT: v_readlane_b32 s5, v59, 3
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s8, 10
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s9, 11
+; GLOBALNESS0-NEXT: v_readlane_b32 s4, v57, 2
+; GLOBALNESS0-NEXT: v_readlane_b32 s5, v57, 3
; GLOBALNESS0-NEXT: s_mov_b32 s83, s55
; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[4:5]
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_13
@@ -513,7 +513,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS0-NEXT: .LBB1_13: ; %bb44.lr.ph.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v58
+; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56
; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc
; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0)
; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1]
@@ -543,8 +543,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_21
; GLOBALNESS0-NEXT: ; %bb.19: ; %bb3.i.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT: v_readlane_b32 s4, v59, 0
-; GLOBALNESS0-NEXT: v_readlane_b32 s5, v59, 1
+; GLOBALNESS0-NEXT: v_readlane_b32 s4, v57, 0
+; GLOBALNESS0-NEXT: v_readlane_b32 s5, v57, 1
; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[4:5]
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_21
; GLOBALNESS0-NEXT: ; %bb.20: ; %bb6.i.i
@@ -580,7 +580,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_mov_b32 s13, s71
; GLOBALNESS0-NEXT: s_mov_b32 s14, s70
; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41
-; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[56:57], off
+; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[58:59], off
; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[54:55]
; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[96:97]
; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_14
@@ -591,12 +591,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_branch .LBB1_14
; GLOBALNESS0-NEXT: .LBB1_24: ; %Flow23
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: v_readlane_b32 s84, v59, 8
-; GLOBALNESS0-NEXT: v_readlane_b32 s8, v59, 10
+; GLOBALNESS0-NEXT: v_readlane_b32 s84, v57, 8
+; GLOBALNESS0-NEXT: v_readlane_b32 s8, v57, 10
; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0
; GLOBALNESS0-NEXT: s_mov_b32 s55, s83
-; GLOBALNESS0-NEXT: v_readlane_b32 s85, v59, 9
-; GLOBALNESS0-NEXT: v_readlane_b32 s9, v59, 11
+; GLOBALNESS0-NEXT: v_readlane_b32 s85, v57, 9
+; GLOBALNESS0-NEXT: v_readlane_b32 s9, v57, 11
; GLOBALNESS0-NEXT: .LBB1_25: ; %Flow24
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[52:53]
@@ -604,8 +604,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_2
; GLOBALNESS0-NEXT: ; %bb.26: ; %bb67.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: v_readlane_b32 s6, v59, 4
-; GLOBALNESS0-NEXT: v_readlane_b32 s7, v59, 5
+; GLOBALNESS0-NEXT: v_readlane_b32 s6, v57, 4
+; GLOBALNESS0-NEXT: v_readlane_b32 s7, v57, 5
; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7]
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_1
; GLOBALNESS0-NEXT: ; %bb.27: ; %bb69.i
diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll
index d230ff5..e1574dc 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll
@@ -1,7 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti | FileCheck %s --check-prefix=SI
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=VI
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s --check-prefix=GFX9
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s --check-prefix=GFX10
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s --check-prefix=GFX11
define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
; SI-LABEL: s_uaddo_i64_zext:
@@ -12,14 +14,14 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_add_u32 s0, s2, s8
; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: v_mov_b32_e32 v1, s3
+; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_addc_u32 s1, s3, s9
+; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -61,6 +63,40 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_uaddo_i64_zext:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_add_u32 s4, s2, s6
+; GFX10-NEXT: s_addc_u32 s5, s3, s7
+; GFX10-NEXT: v_cmp_lt_u64_e64 s2, s[4:5], s[2:3]
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX10-NEXT: v_add_co_u32 v0, s2, s4, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s5, 0, s2
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_uaddo_i64_zext:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_add_u32 s4, s2, s4
+; GFX11-NEXT: s_addc_u32 s5, s3, s5
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_lt_u64_e64 s2, s[4:5], s[2:3]
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_u32 v0, s2, s4, v0
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s2
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
%val = extractvalue { i64, i1 } %uadd, 0
%carry = extractvalue { i64, i1 } %uadd, 1
@@ -76,21 +112,21 @@ define amdgpu_kernel void @s_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-LABEL: s_uaddo_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xd
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s10, s6
-; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: v_mov_b32_e32 v0, s9
; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: s_mov_b32 s8, s2
-; SI-NEXT: s_mov_b32 s9, s3
-; SI-NEXT: v_mov_b32_e32 v0, s13
-; SI-NEXT: v_add_i32_e32 v0, vcc, s12, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, s8, v0
+; SI-NEXT: s_mov_b32 s0, s2
+; SI-NEXT: s_mov_b32 s1, s3
+; SI-NEXT: s_mov_b32 s2, s6
+; SI-NEXT: s_mov_b32 s3, s7
; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; SI-NEXT: buffer_store_byte v1, off, s[8:11], 0
+; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_uaddo_i32:
@@ -121,6 +157,34 @@ define amdgpu_kernel void @s_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: global_store_byte v0, v2, s[2:3]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_uaddo_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v1, s4, s6, s7
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_byte v0, v2, s[2:3]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_uaddo_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v1, s4, s6, s7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT: s_endpgm
%uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %uadd, 0
%carry = extractvalue { i32, i1 } %uadd, 1
@@ -137,17 +201,15 @@ define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
-; SI-NEXT: s_mov_b32 s16, s6
-; SI-NEXT: s_mov_b32 s17, s7
-; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
-; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
@@ -193,6 +255,38 @@ define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
; GFX9-NEXT: global_store_byte v0, v2, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_uaddo_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dword v1, v0, s[12:13]
+; GFX10-NEXT: global_load_dword v2, v0, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX10-NEXT: global_store_byte v0, v2, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_uaddo_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[4:5]
+; GFX11-NEXT: global_load_b32 v2, v0, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v1, s4, v1, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr
@@ -215,17 +309,15 @@ define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspac
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
-; SI-NEXT: s_mov_b32 s16, s6
-; SI-NEXT: s_mov_b32 s17, s7
-; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
-; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
@@ -283,6 +375,45 @@ define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspac
; GFX9-NEXT: global_store_byte v0, v2, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_uaddo_i32_novcc:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dword v1, v0, s[12:13]
+; GFX10-NEXT: global_load_dword v2, v0, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: global_store_byte v0, v2, s[10:11]
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_uaddo_i32_novcc:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[4:5]
+; GFX11-NEXT: global_load_b32 v2, v0, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v1, s4, v1, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr
@@ -306,21 +437,21 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_add_u32 s6, s4, s6
-; SI-NEXT: s_addc_u32 s7, s5, s7
-; SI-NEXT: s_mov_b32 s14, s10
-; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s8, s0
-; SI-NEXT: s_mov_b32 s9, s1
-; SI-NEXT: s_mov_b32 s12, s2
-; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: s_addc_u32 s7, s5, s7
; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
; SI-NEXT: v_mov_b32_e32 v2, s6
+; SI-NEXT: s_mov_b32 s8, s0
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: s_mov_b32 s0, s2
+; SI-NEXT: s_mov_b32 s1, s3
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
; SI-NEXT: v_mov_b32_e32 v3, s7
-; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
-; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-NEXT: buffer_store_byte v0, off, s[12:15], 0
+; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
+; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_uaddo_i64:
@@ -359,6 +490,37 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9]
; GFX9-NEXT: global_store_byte v4, v0, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_uaddo_i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_add_u32 s0, s12, s14
+; GFX10-NEXT: s_addc_u32 s1, s13, s15
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[12:13]
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX10-NEXT: global_store_byte v2, v3, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_uaddo_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_add_u32 s6, s4, s6
+; GFX11-NEXT: s_addc_u32 s7, s5, s7
+; GFX11-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
+; GFX11-NEXT: s_endpgm
%uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
%val = extractvalue { i64, i1 } %uadd, 0
%carry = extractvalue { i64, i1 } %uadd, 1
@@ -375,17 +537,15 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
-; SI-NEXT: s_mov_b32 s16, s6
-; SI-NEXT: s_mov_b32 s17, s7
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
-; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
@@ -393,8 +553,8 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, v0, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc
-; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
; SI-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
+; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
@@ -437,6 +597,42 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: global_store_byte v4, v0, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_uaddo_i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[12:13]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9]
+; GFX10-NEXT: global_store_byte v4, v0, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_uaddo_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b64 v[0:1], v4, s[4:5]
+; GFX11-NEXT: global_load_b64 v[2:3], v4, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
+; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1]
+; GFX11-NEXT: global_store_b8 v4, v0, s[2:3]
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i64, ptr addrspace(1) %a.ptr
@@ -459,17 +655,15 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
-; SI-NEXT: s_mov_b32 s16, s6
-; SI-NEXT: s_mov_b32 s17, s7
-; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
-; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
@@ -477,8 +671,8 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0
-; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, v1, v0
+; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
@@ -522,6 +716,42 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: global_store_short v0, v2, s[8:9]
; GFX9-NEXT: global_store_byte v0, v1, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_uaddo_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_ushort v1, v0, s[12:13]
+; GFX10-NEXT: global_load_ushort v2, v0, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v2
+; GFX10-NEXT: v_cmp_lt_u32_sdwa s0, v2, v1 src0_sel:WORD_0 src1_sel:WORD_0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX10-NEXT: global_store_short v0, v2, s[8:9]
+; GFX10-NEXT: global_store_byte v0, v1, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_uaddo_i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_d16_b16 v1, v0, s[4:5]
+; GFX11-NEXT: global_load_u16 v2, v0, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_nc_u32_e32 v2, v1, v2
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v2
+; GFX11-NEXT: v_cmp_lt_u32_e32 vcc_lo, v3, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-NEXT: global_store_b8 v0, v1, s[2:3]
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i16, ptr addrspace(1) %a.ptr
@@ -544,17 +774,15 @@ define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
-; SI-NEXT: s_mov_b32 s16, s6
-; SI-NEXT: s_mov_b32 s17, s7
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
-; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
@@ -606,6 +834,42 @@ define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_uaddo_v2i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[12:13]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v3
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
+; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_uaddo_v2i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b64 v[0:1], v4, s[4:5]
+; GFX11-NEXT: global_load_b64 v[2:3], v4, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v1, s4, v1, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX11-NEXT: v_add_co_u32 v0, s4, v0, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v4, v[2:3], s[2:3]
+; GFX11-NEXT: s_endpgm
%a = load <2 x i32>, ptr addrspace(1) %aptr, align 4
%b = load <2 x i32>, ptr addrspace(1) %bptr, align 4
%sadd = call { <2 x i32>, <2 x i1> } @llvm.uadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
@@ -623,26 +887,27 @@ define amdgpu_kernel void @s_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
-; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; SI-NEXT: s_cmp_eq_u32 s0, s1
+; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_cbranch_scc1 .LBB8_2
; SI-NEXT: ; %bb.1: ; %if
; SI-NEXT: s_xor_b64 s[0:1], vcc, -1
; SI-NEXT: .LBB8_2: ; %exit
; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; SI-NEXT: s_mov_b32 s10, s2
-; SI-NEXT: s_mov_b32 s11, s3
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s0, s4
-; SI-NEXT: s_mov_b32 s1, s5
-; SI-NEXT: s_mov_b32 s8, s6
-; SI-NEXT: s_mov_b32 s9, s7
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; SI-NEXT: buffer_store_byte v1, off, s[8:11], 0
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
+; SI-NEXT: s_mov_b32 s6, s10
+; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_uaddo_clamp_bit:
@@ -687,6 +952,45 @@ define amdgpu_kernel void @s_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
; GFX9-NEXT: global_store_dword v1, v0, s[8:9]
; GFX9-NEXT: global_store_byte v1, v2, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_uaddo_clamp_bit:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34
+; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v0, s1, s2, s3
+; GFX10-NEXT: s_cmp_eq_u32 s2, s3
+; GFX10-NEXT: s_cbranch_scc1 .LBB8_2
+; GFX10-NEXT: ; %bb.1: ; %if
+; GFX10-NEXT: s_xor_b32 s0, s1, -1
+; GFX10-NEXT: .LBB8_2: ; %exit
+; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_store_dword v1, v0, s[8:9]
+; GFX10-NEXT: global_store_byte v1, v2, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_uaddo_clamp_bit:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x34
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, s1, s2, s3
+; GFX11-NEXT: s_cmp_eq_u32 s2, s3
+; GFX11-NEXT: s_cbranch_scc1 .LBB8_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: s_xor_b32 s0, s1, -1
+; GFX11-NEXT: .LBB8_2: ; %exit
+; GFX11-NEXT: s_load_b128 s[4:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT: global_store_b8 v1, v2, s[6:7]
+; GFX11-NEXT: s_endpgm
entry:
%uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %uadd, 0
@@ -711,19 +1015,19 @@ define amdgpu_kernel void @v_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
; SI-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: s_mov_b32 s14, s2
-; SI-NEXT: s_mov_b32 s15, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s0, s8
; SI-NEXT: s_mov_b32 s1, s9
-; SI-NEXT: s_mov_b32 s12, s10
-; SI-NEXT: s_mov_b32 s13, s11
+; SI-NEXT: s_mov_b32 s8, s10
+; SI-NEXT: s_mov_b32 s9, s11
+; SI-NEXT: s_mov_b32 s10, s2
+; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0
-; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0
+; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e64 v0, s[0:1], v1, v2
; SI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: v_add_i32_e64 v0, s[0:1], v1, v2
; SI-NEXT: s_cbranch_vccnz .LBB9_2
; SI-NEXT: ; %bb.1: ; %if
; SI-NEXT: s_xor_b64 s[8:9], s[0:1], -1
@@ -786,6 +1090,50 @@ define amdgpu_kernel void @v_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3]
; GFX9-NEXT: global_store_byte v0, v1, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_uaddo_clamp_bit:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dword v1, v0, s[12:13]
+; GFX10-NEXT: global_load_dword v2, v0, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT: v_add_co_u32 v1, s1, v1, v2
+; GFX10-NEXT: s_cbranch_vccnz .LBB9_2
+; GFX10-NEXT: ; %bb.1: ; %if
+; GFX10-NEXT: s_xor_b32 s0, s1, -1
+; GFX10-NEXT: .LBB9_2: ; %exit
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX10-NEXT: global_store_byte v0, v2, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_uaddo_clamp_bit:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[4:5]
+; GFX11-NEXT: global_load_b32 v2, v0, s[6:7]
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT: v_add_co_u32 v1, s5, v1, v2
+; GFX11-NEXT: s_cbranch_vccnz .LBB9_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: s_xor_b32 s4, s5, -1
+; GFX11-NEXT: .LBB9_2: ; %exit
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -813,23 +1161,23 @@ exit:
define amdgpu_cs void @sv_uaddo_i128(ptr addrspace(1) %out, i128 inreg %a, i128 %b) {
; SI-LABEL: sv_uaddo_i128:
; SI: ; %bb.0:
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; SI-NEXT: v_mov_b32_e32 v6, s1
-; SI-NEXT: v_mov_b32_e32 v7, s2
-; SI-NEXT: v_mov_b32_e32 v8, s3
-; SI-NEXT: s_mov_b32 s4, s6
-; SI-NEXT: s_mov_b32 s5, s6
; SI-NEXT: v_addc_u32_e32 v3, vcc, v6, v3, vcc
-; SI-NEXT: v_addc_u32_e32 v4, vcc, v7, v4, vcc
-; SI-NEXT: v_cmp_gt_u64_e64 s[0:1], s[0:1], v[2:3]
-; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; SI-NEXT: v_addc_u32_e32 v5, vcc, v8, v5, vcc
+; SI-NEXT: v_mov_b32_e32 v6, s2
+; SI-NEXT: v_addc_u32_e32 v4, vcc, v6, v4, vcc
+; SI-NEXT: v_mov_b32_e32 v6, s3
+; SI-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc
+; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[2:3]
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[4:5]
+; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; SI-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[4:5]
+; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; SI-NEXT: s_mov_b32 s5, s6
; SI-NEXT: v_and_b32_e32 v2, 1, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
@@ -871,6 +1219,41 @@ define amdgpu_cs void @sv_uaddo_i128(ptr addrspace(1) %out, i128 inreg %a, i128
; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: sv_uaddo_i128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, s0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v3, vcc_lo
+; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s2, v4, vcc_lo
+; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s3, v5, vcc_lo
+; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[2:3]
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[4:5]
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[4:5]
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: sv_uaddo_i128:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, s0, v2
+; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v3, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s2, v4, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s3, v5, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[4:5]
+; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[2:3]
+; GFX11-NEXT: v_mov_b16_e32 v2.l, v6.l
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[4:5]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b16 v2.l, v2.l, v3.l, vcc_lo
+; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
%uadd = call { i128, i1 } @llvm.uadd.with.overflow.i128(i128 %a, i128 %b)
%carry = extractvalue { i128, i1 } %uadd, 1
%carry.ext = zext i1 %carry to i32
diff --git a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll
index d0d1ba8..b3166fa 100644
--- a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll
+++ b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll
@@ -8,9 +8,8 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; CHECK-NEXT: v_mov_b32_e32 v40, v0
-; CHECK-NEXT: v_pk_mov_b32 v[0:1], 0, 0
-; CHECK-NEXT: flat_load_dword v42, v[0:1]
+; CHECK-NEXT: v_pk_mov_b32 v[44:45], 0, 0
+; CHECK-NEXT: flat_load_dword v42, v[44:45]
; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x8
@@ -19,48 +18,44 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_mov_b64 s[34:35], s[8:9]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v46, s6
-; CHECK-NEXT: v_mov_b32_e32 v47, s7
+; CHECK-NEXT: v_accvgpr_write_b32 a32, s6
+; CHECK-NEXT: v_accvgpr_write_b32 a33, s7
; CHECK-NEXT: s_mov_b64 s[6:7], src_private_base
; CHECK-NEXT: s_cmp_lg_u32 s64, -1
; CHECK-NEXT: s_cselect_b32 s7, s7, 0
; CHECK-NEXT: s_cselect_b32 s8, s64, 0
; CHECK-NEXT: s_add_u32 s50, s34, 48
; CHECK-NEXT: s_addc_u32 s51, s35, 0
-; CHECK-NEXT: v_pk_mov_b32 v[58:59], s[4:5], s[4:5] op_sel:[0,1]
+; CHECK-NEXT: v_pk_mov_b32 v[56:57], s[4:5], s[4:5] op_sel:[0,1]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, G@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, G@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[54:55], s[4:5], 0x0
; CHECK-NEXT: s_mov_b32 s6, 0
-; CHECK-NEXT: v_pk_mov_b32 v[0:1], 0, 0
-; CHECK-NEXT: v_mov_b32_e32 v57, s7
+; CHECK-NEXT: v_mov_b32_e32 v47, s7
; CHECK-NEXT: s_mov_b32 s7, s6
; CHECK-NEXT: s_mov_b32 s53, s14
-; CHECK-NEXT: v_accvgpr_write_b32 a33, v1
-; CHECK-NEXT: v_mov_b32_e32 v56, s8
-; CHECK-NEXT: v_pk_mov_b32 v[60:61], s[6:7], s[6:7] op_sel:[0,1]
+; CHECK-NEXT: v_mov_b32_e32 v46, s8
+; CHECK-NEXT: v_pk_mov_b32 v[58:59], s[6:7], s[6:7] op_sel:[0,1]
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[50:51]
; CHECK-NEXT: s_mov_b32 s12, s14
; CHECK-NEXT: s_mov_b32 s13, s15
; CHECK-NEXT: s_mov_b32 s14, s16
-; CHECK-NEXT: v_mov_b32_e32 v31, v40
+; CHECK-NEXT: v_mov_b32_e32 v31, v0
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_mov_b32 s33, s16
; CHECK-NEXT: s_mov_b32 s52, s15
; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11]
-; CHECK-NEXT: v_accvgpr_write_b32 a32, v0
-; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[60:61]
+; CHECK-NEXT: v_mov_b32_e32 v40, v0
+; CHECK-NEXT: flat_store_dwordx2 v[56:57], v[58:59]
; CHECK-NEXT: ; kill: def $sgpr15 killed $sgpr15
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[54:55]
-; CHECK-NEXT: flat_load_dwordx2 v[62:63], v[58:59]
-; CHECK-NEXT: v_accvgpr_read_b32 v0, a32
-; CHECK-NEXT: v_mov_b32_e32 v44, 0
-; CHECK-NEXT: v_mov_b32_e32 v45, 0x3ff00000
-; CHECK-NEXT: v_accvgpr_read_b32 v1, a33
+; CHECK-NEXT: flat_load_dwordx2 v[60:61], v[56:57]
+; CHECK-NEXT: v_mov_b32_e32 v62, 0
+; CHECK-NEXT: v_mov_b32_e32 v63, 0x3ff00000
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[50:51]
@@ -69,20 +64,20 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x
; CHECK-NEXT: s_mov_b32 s13, s52
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: v_mov_b32_e32 v31, v40
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[44:45]
-; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[60:61]
+; CHECK-NEXT: flat_store_dwordx2 v[44:45], v[62:63]
+; CHECK-NEXT: flat_store_dwordx2 v[56:57], v[58:59]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: ; kill: def $sgpr15 killed $sgpr15
; CHECK-NEXT: s_swappc_b64 s[30:31], s[54:55]
-; CHECK-NEXT: flat_load_dwordx2 v[0:1], v[56:57] glc
+; CHECK-NEXT: flat_load_dwordx2 v[0:1], v[46:47] glc
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, s64
; CHECK-NEXT: v_cmp_lt_i32_e32 vcc, 0, v42
-; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[62:63]
+; CHECK-NEXT: flat_store_dwordx2 v[56:57], v[60:61]
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[46:47]
-; CHECK-NEXT: buffer_store_dword v47, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_store_dword v44, v0, s[0:3], 0 offen
+; CHECK-NEXT: flat_store_dwordx2 v[56:57], a[32:33]
+; CHECK-NEXT: buffer_store_dword a33, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v62, v0, s[0:3], 0 offen
; CHECK-NEXT: ; implicit-def: $vgpr4
; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll
index 7d7f1b4..0289dab 100644
--- a/llvm/test/CodeGen/AMDGPU/usubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubo.ll
@@ -1,8 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
-
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti | FileCheck %s --check-prefix=SI
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=VI
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s --check-prefix=GFX9
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s --check-prefix=GFX10
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s --check-prefix=GFX11
define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
; SI-LABEL: s_usubo_i64_zext:
@@ -13,14 +14,14 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_sub_u32 s0, s2, s8
; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: v_mov_b32_e32 v1, s3
+; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_subb_u32 s1, s3, s9
+; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -62,6 +63,40 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_usubo_i64_zext:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_sub_u32 s4, s2, s6
+; GFX10-NEXT: s_subb_u32 s5, s3, s7
+; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[4:5], s[2:3]
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX10-NEXT: v_add_co_u32 v0, s2, s4, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s5, 0, s2
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_usubo_i64_zext:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_sub_u32 s4, s2, s4
+; GFX11-NEXT: s_subb_u32 s5, s3, s5
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_gt_u64_e64 s2, s[4:5], s[2:3]
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_u32 v0, s2, s4, v0
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s2
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) #0
%val = extractvalue { i64, i1 } %usub, 0
%carry = extractvalue { i64, i1 } %usub, 1
@@ -76,21 +111,21 @@ define amdgpu_kernel void @s_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-LABEL: s_usubo_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xd
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s10, s6
-; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: v_mov_b32_e32 v0, s9
; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: s_mov_b32 s8, s2
-; SI-NEXT: s_mov_b32 s9, s3
-; SI-NEXT: v_mov_b32_e32 v0, s13
-; SI-NEXT: v_sub_i32_e32 v0, vcc, s12, v0
+; SI-NEXT: v_sub_i32_e32 v0, vcc, s8, v0
+; SI-NEXT: s_mov_b32 s0, s2
+; SI-NEXT: s_mov_b32 s1, s3
+; SI-NEXT: s_mov_b32 s2, s6
+; SI-NEXT: s_mov_b32 s3, s7
; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; SI-NEXT: buffer_store_byte v1, off, s[8:11], 0
+; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_usubo_i32:
@@ -121,6 +156,34 @@ define amdgpu_kernel void @s_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: global_store_byte v0, v2, s[2:3]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_usubo_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_sub_co_u32 v1, s4, s6, s7
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_byte v0, v2, s[2:3]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_usubo_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_sub_co_u32 v1, s4, s6, s7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT: s_endpgm
%usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %usub, 0
%carry = extractvalue { i32, i1 } %usub, 1
@@ -137,17 +200,15 @@ define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
-; SI-NEXT: s_mov_b32 s16, s6
-; SI-NEXT: s_mov_b32 s17, s7
-; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
-; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
@@ -193,6 +254,38 @@ define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
; GFX9-NEXT: global_store_byte v0, v2, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_usubo_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dword v1, v0, s[12:13]
+; GFX10-NEXT: global_load_dword v2, v0, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_sub_co_u32 v1, s0, v1, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX10-NEXT: global_store_byte v0, v2, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_usubo_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[4:5]
+; GFX11-NEXT: global_load_b32 v2, v0, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_sub_co_u32 v1, s4, v1, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr
@@ -215,17 +308,15 @@ define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspac
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
-; SI-NEXT: s_mov_b32 s16, s6
-; SI-NEXT: s_mov_b32 s17, s7
-; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
-; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
@@ -283,6 +374,45 @@ define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspac
; GFX9-NEXT: global_store_byte v0, v2, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_usubo_i32_novcc:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dword v1, v0, s[12:13]
+; GFX10-NEXT: global_load_dword v2, v0, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_sub_co_u32 v1, s0, v1, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: global_store_byte v0, v2, s[10:11]
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_usubo_i32_novcc:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[4:5]
+; GFX11-NEXT: global_load_b32 v2, v0, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_sub_co_u32 v1, s4, v1, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr
@@ -306,21 +436,21 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_sub_u32 s6, s4, s6
-; SI-NEXT: s_subb_u32 s7, s5, s7
-; SI-NEXT: s_mov_b32 s14, s10
-; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s8, s0
-; SI-NEXT: s_mov_b32 s9, s1
-; SI-NEXT: s_mov_b32 s12, s2
-; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: s_subb_u32 s7, s5, s7
; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
; SI-NEXT: v_mov_b32_e32 v2, s6
+; SI-NEXT: s_mov_b32 s8, s0
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: s_mov_b32 s0, s2
+; SI-NEXT: s_mov_b32 s1, s3
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
; SI-NEXT: v_mov_b32_e32 v3, s7
-; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
-; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-NEXT: buffer_store_byte v0, off, s[12:15], 0
+; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
+; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_usubo_i64:
@@ -359,6 +489,37 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9]
; GFX9-NEXT: global_store_byte v4, v0, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_usubo_i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_sub_u32 s0, s12, s14
+; GFX10-NEXT: s_subb_u32 s1, s13, s15
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], s[12:13]
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX10-NEXT: global_store_byte v2, v3, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_usubo_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_sub_u32 s6, s4, s6
+; GFX11-NEXT: s_subb_u32 s7, s5, s7
+; GFX11-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-NEXT: v_cmp_gt_u64_e64 s4, s[6:7], s[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
+; GFX11-NEXT: s_endpgm
%usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
%val = extractvalue { i64, i1 } %usub, 0
%carry = extractvalue { i64, i1 } %usub, 1
@@ -375,17 +536,15 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
-; SI-NEXT: s_mov_b32 s16, s6
-; SI-NEXT: s_mov_b32 s17, s7
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
-; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
@@ -393,8 +552,8 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_sub_i32_e32 v2, vcc, v0, v2
; SI-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc
-; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
+; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
@@ -437,6 +596,42 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: global_store_byte v4, v0, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_usubo_i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[12:13]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2
+; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9]
+; GFX10-NEXT: global_store_byte v4, v0, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_usubo_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b64 v[0:1], v4, s[4:5]
+; GFX11-NEXT: global_load_b64 v[2:3], v4, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
+; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1]
+; GFX11-NEXT: global_store_b8 v4, v0, s[2:3]
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i64, ptr addrspace(1) %a.ptr
@@ -459,17 +654,15 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
-; SI-NEXT: s_mov_b32 s16, s6
-; SI-NEXT: s_mov_b32 s17, s7
-; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
-; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
@@ -477,8 +670,8 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0
-; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, v1, v0
+; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
@@ -522,6 +715,42 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: global_store_short v0, v2, s[8:9]
; GFX9-NEXT: global_store_byte v0, v1, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_usubo_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_ushort v1, v0, s[12:13]
+; GFX10-NEXT: global_load_ushort v2, v0, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_sub_nc_u32_e32 v2, v1, v2
+; GFX10-NEXT: v_cmp_gt_u32_sdwa s0, v2, v1 src0_sel:WORD_0 src1_sel:WORD_0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX10-NEXT: global_store_short v0, v2, s[8:9]
+; GFX10-NEXT: global_store_byte v0, v1, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_usubo_i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_d16_b16 v1, v0, s[4:5]
+; GFX11-NEXT: global_load_u16 v2, v0, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_sub_nc_u32_e32 v2, v1, v2
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v2
+; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, v3, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-NEXT: global_store_b8 v0, v1, s[2:3]
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i16, ptr addrspace(1) %a.ptr
@@ -544,17 +773,15 @@ define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
-; SI-NEXT: s_mov_b32 s16, s6
-; SI-NEXT: s_mov_b32 s17, s7
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
-; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
@@ -606,6 +833,42 @@ define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_usubo_v2i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[12:13]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_sub_co_u32 v1, s0, v1, v3
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX10-NEXT: v_sub_co_u32 v0, s0, v0, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
+; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_usubo_v2i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b64 v[0:1], v4, s[4:5]
+; GFX11-NEXT: global_load_b64 v[2:3], v4, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_sub_co_u32 v1, s4, v1, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX11-NEXT: v_sub_co_u32 v0, s4, v0, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v4, v[2:3], s[2:3]
+; GFX11-NEXT: s_endpgm
%a = load <2 x i32>, ptr addrspace(1) %aptr, align 4
%b = load <2 x i32>, ptr addrspace(1) %bptr, align 4
%sadd = call { <2 x i32>, <2 x i1> } @llvm.usub.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
@@ -623,26 +886,27 @@ define amdgpu_kernel void @s_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
-; SI-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
; SI-NEXT: s_cmp_eq_u32 s0, s1
+; SI-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_cbranch_scc1 .LBB8_2
; SI-NEXT: ; %bb.1: ; %if
; SI-NEXT: s_xor_b64 s[0:1], vcc, -1
; SI-NEXT: .LBB8_2: ; %exit
; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; SI-NEXT: s_mov_b32 s10, s2
-; SI-NEXT: s_mov_b32 s11, s3
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s0, s4
-; SI-NEXT: s_mov_b32 s1, s5
-; SI-NEXT: s_mov_b32 s8, s6
-; SI-NEXT: s_mov_b32 s9, s7
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; SI-NEXT: buffer_store_byte v1, off, s[8:11], 0
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
+; SI-NEXT: s_mov_b32 s6, s10
+; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_usubo_clamp_bit:
@@ -687,6 +951,45 @@ define amdgpu_kernel void @s_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
; GFX9-NEXT: global_store_dword v1, v0, s[8:9]
; GFX9-NEXT: global_store_byte v1, v2, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_usubo_clamp_bit:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34
+; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_sub_co_u32 v0, s1, s2, s3
+; GFX10-NEXT: s_cmp_eq_u32 s2, s3
+; GFX10-NEXT: s_cbranch_scc1 .LBB8_2
+; GFX10-NEXT: ; %bb.1: ; %if
+; GFX10-NEXT: s_xor_b32 s0, s1, -1
+; GFX10-NEXT: .LBB8_2: ; %exit
+; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_store_dword v1, v0, s[8:9]
+; GFX10-NEXT: global_store_byte v1, v2, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_usubo_clamp_bit:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x34
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_sub_co_u32 v0, s1, s2, s3
+; GFX11-NEXT: s_cmp_eq_u32 s2, s3
+; GFX11-NEXT: s_cbranch_scc1 .LBB8_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: s_xor_b32 s0, s1, -1
+; GFX11-NEXT: .LBB8_2: ; %exit
+; GFX11-NEXT: s_load_b128 s[4:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT: global_store_b8 v1, v2, s[6:7]
+; GFX11-NEXT: s_endpgm
entry:
%usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %usub, 0
@@ -712,19 +1015,19 @@ define amdgpu_kernel void @v_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
; SI-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: s_mov_b32 s14, s2
-; SI-NEXT: s_mov_b32 s15, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s0, s8
; SI-NEXT: s_mov_b32 s1, s9
-; SI-NEXT: s_mov_b32 s12, s10
-; SI-NEXT: s_mov_b32 s13, s11
+; SI-NEXT: s_mov_b32 s8, s10
+; SI-NEXT: s_mov_b32 s9, s11
+; SI-NEXT: s_mov_b32 s10, s2
+; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0
-; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0
+; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_sub_i32_e64 v0, s[0:1], v1, v2
; SI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: v_sub_i32_e64 v0, s[0:1], v1, v2
; SI-NEXT: s_cbranch_vccnz .LBB9_2
; SI-NEXT: ; %bb.1: ; %if
; SI-NEXT: s_xor_b64 s[8:9], s[0:1], -1
@@ -787,6 +1090,50 @@ define amdgpu_kernel void @v_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3]
; GFX9-NEXT: global_store_byte v0, v1, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_usubo_clamp_bit:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dword v1, v0, s[12:13]
+; GFX10-NEXT: global_load_dword v2, v0, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT: v_sub_co_u32 v1, s1, v1, v2
+; GFX10-NEXT: s_cbranch_vccnz .LBB9_2
+; GFX10-NEXT: ; %bb.1: ; %if
+; GFX10-NEXT: s_xor_b32 s0, s1, -1
+; GFX10-NEXT: .LBB9_2: ; %exit
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX10-NEXT: global_store_byte v0, v2, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_usubo_clamp_bit:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[4:5]
+; GFX11-NEXT: global_load_b32 v2, v0, s[6:7]
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT: v_sub_co_u32 v1, s5, v1, v2
+; GFX11-NEXT: s_cbranch_vccnz .LBB9_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: s_xor_b32 s4, s5, -1
+; GFX11-NEXT: .LBB9_2: ; %exit
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index 2f25a93..fe7def8a 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -1961,16 +1961,15 @@ define <6 x half> @shuffle_v6f16_452367(ptr addrspace(1) %arg0, ptr addrspace(1)
; GFX942-LABEL: shuffle_v6f16_452367:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: global_load_dwordx3 v[0:2], v[6:7], off
-; GFX942-NEXT: global_load_dword v3, v[4:5], off
+; GFX942-NEXT: global_load_dwordx3 v[4:6], v[0:1], off
+; GFX942-NEXT: global_load_dword v4, v[2:3], off
+; GFX942-NEXT: ; kill: killed $vgpr0 killed $vgpr1
+; GFX942-NEXT: ; kill: killed $vgpr2 killed $vgpr3
; GFX942-NEXT: s_waitcnt vmcnt(1)
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v5
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v6f16_452367:
@@ -5151,16 +5150,15 @@ define <6 x bfloat> @shuffle_v6bf16_452367(ptr addrspace(1) %arg0, ptr addrspace
; GFX942-LABEL: shuffle_v6bf16_452367:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: global_load_dwordx3 v[0:2], v[6:7], off
-; GFX942-NEXT: global_load_dword v3, v[4:5], off
+; GFX942-NEXT: global_load_dwordx3 v[4:6], v[0:1], off
+; GFX942-NEXT: global_load_dword v4, v[2:3], off
+; GFX942-NEXT: ; kill: killed $vgpr0 killed $vgpr1
+; GFX942-NEXT: ; kill: killed $vgpr2 killed $vgpr3
; GFX942-NEXT: s_waitcnt vmcnt(1)
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v5
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v6bf16_452367:
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index a401f989..d8264b5a 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -58,19 +58,19 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-NEXT: v_and_b32_e32 v3, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 2, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 2, v3
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dword v1, v2, s[0:1]
+; GFX942-NEXT: global_load_dword v2, v1, s[0:1]
; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v3
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB1_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dword v1, v2, s[2:3]
+; GFX942-NEXT: global_load_dword v2, v1, s[2:3]
; GFX942-NEXT: .LBB1_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX942-NEXT: global_store_dword v0, v2, s[6:7]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -136,19 +136,19 @@ define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v4
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1]
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[0:1]
; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB3_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3]
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[2:3]
; GFX942-NEXT: .LBB3_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[6:7]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -173,19 +173,19 @@ define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-NEXT: v_and_b32_e32 v6, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v5, 4, v6
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 4, v6
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx4 v[0:3], v5, s[0:1]
+; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[0:1]
; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v6
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB4_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3]
+; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[2:3]
; GFX942-NEXT: .LBB4_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[6:7]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -210,23 +210,23 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-NEXT: v_and_b32_e32 v10, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v9, 5, v10
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 5, v10
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx4 v[4:7], v9, s[0:1] offset:16
-; GFX942-NEXT: global_load_dwordx4 v[0:3], v9, s[0:1]
+; GFX942-NEXT: global_load_dwordx4 v[6:9], v1, s[0:1] offset:16
+; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[0:1]
; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v10
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB5_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx4 v[4:7], v9, s[2:3] offset:16
-; GFX942-NEXT: global_load_dwordx4 v[0:3], v9, s[2:3]
+; GFX942-NEXT: global_load_dwordx4 v[6:9], v1, s[2:3] offset:16
+; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[2:3]
; GFX942-NEXT: .LBB5_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(1)
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] offset:16
; GFX942-NEXT: s_waitcnt vmcnt(1)
-; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[6:7]
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[6:7]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -250,72 +250,72 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT: v_and_b32_e32 v2, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v2
+; GFX942-NEXT: v_and_b32_e32 v62, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v62
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx4 v[28:31], v1, s[0:1] offset:240
-; GFX942-NEXT: global_load_dwordx4 v[24:27], v1, s[0:1] offset:224
-; GFX942-NEXT: global_load_dwordx4 v[20:23], v1, s[0:1] offset:208
-; GFX942-NEXT: global_load_dwordx4 v[16:19], v1, s[0:1] offset:192
-; GFX942-NEXT: global_load_dwordx4 v[12:15], v1, s[0:1] offset:176
-; GFX942-NEXT: global_load_dwordx4 v[8:11], v1, s[0:1] offset:160
-; GFX942-NEXT: global_load_dwordx4 v[4:7], v1, s[0:1] offset:144
-; GFX942-NEXT: global_load_dwordx4 a[0:3], v1, s[0:1] offset:128
-; GFX942-NEXT: global_load_dwordx4 v[60:63], v1, s[0:1] offset:112
-; GFX942-NEXT: global_load_dwordx4 v[56:59], v1, s[0:1] offset:96
-; GFX942-NEXT: global_load_dwordx4 v[52:55], v1, s[0:1] offset:80
-; GFX942-NEXT: global_load_dwordx4 v[48:51], v1, s[0:1] offset:64
-; GFX942-NEXT: global_load_dwordx4 v[44:47], v1, s[0:1] offset:48
-; GFX942-NEXT: global_load_dwordx4 v[40:43], v1, s[0:1] offset:32
-; GFX942-NEXT: global_load_dwordx4 v[36:39], v1, s[0:1] offset:16
-; GFX942-NEXT: global_load_dwordx4 v[32:35], v1, s[0:1]
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v2
+; GFX942-NEXT: global_load_dwordx4 v[30:33], v1, s[0:1] offset:240
+; GFX942-NEXT: global_load_dwordx4 v[26:29], v1, s[0:1] offset:224
+; GFX942-NEXT: global_load_dwordx4 v[22:25], v1, s[0:1] offset:208
+; GFX942-NEXT: global_load_dwordx4 v[18:21], v1, s[0:1] offset:192
+; GFX942-NEXT: global_load_dwordx4 v[14:17], v1, s[0:1] offset:176
+; GFX942-NEXT: global_load_dwordx4 v[10:13], v1, s[0:1] offset:160
+; GFX942-NEXT: global_load_dwordx4 v[6:9], v1, s[0:1] offset:144
+; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[0:1] offset:128
+; GFX942-NEXT: global_load_dwordx4 a[0:3], v1, s[0:1] offset:112
+; GFX942-NEXT: global_load_dwordx4 v[58:61], v1, s[0:1] offset:96
+; GFX942-NEXT: global_load_dwordx4 v[54:57], v1, s[0:1] offset:80
+; GFX942-NEXT: global_load_dwordx4 v[50:53], v1, s[0:1] offset:64
+; GFX942-NEXT: global_load_dwordx4 v[46:49], v1, s[0:1] offset:48
+; GFX942-NEXT: global_load_dwordx4 v[42:45], v1, s[0:1] offset:32
+; GFX942-NEXT: global_load_dwordx4 v[38:41], v1, s[0:1] offset:16
+; GFX942-NEXT: global_load_dwordx4 v[34:37], v1, s[0:1]
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v62
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB6_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx4 v[28:31], v1, s[2:3] offset:240
-; GFX942-NEXT: global_load_dwordx4 v[24:27], v1, s[2:3] offset:224
-; GFX942-NEXT: global_load_dwordx4 v[20:23], v1, s[2:3] offset:208
-; GFX942-NEXT: global_load_dwordx4 v[16:19], v1, s[2:3] offset:192
-; GFX942-NEXT: global_load_dwordx4 v[12:15], v1, s[2:3] offset:176
-; GFX942-NEXT: global_load_dwordx4 v[8:11], v1, s[2:3] offset:160
-; GFX942-NEXT: global_load_dwordx4 v[4:7], v1, s[2:3] offset:144
-; GFX942-NEXT: global_load_dwordx4 a[0:3], v1, s[2:3] offset:128
-; GFX942-NEXT: global_load_dwordx4 v[60:63], v1, s[2:3] offset:112
-; GFX942-NEXT: global_load_dwordx4 v[56:59], v1, s[2:3] offset:96
-; GFX942-NEXT: global_load_dwordx4 v[52:55], v1, s[2:3] offset:80
-; GFX942-NEXT: global_load_dwordx4 v[48:51], v1, s[2:3] offset:64
-; GFX942-NEXT: global_load_dwordx4 v[44:47], v1, s[2:3] offset:48
-; GFX942-NEXT: global_load_dwordx4 v[40:43], v1, s[2:3] offset:32
-; GFX942-NEXT: global_load_dwordx4 v[36:39], v1, s[2:3] offset:16
-; GFX942-NEXT: global_load_dwordx4 v[32:35], v1, s[2:3]
+; GFX942-NEXT: global_load_dwordx4 v[30:33], v1, s[2:3] offset:240
+; GFX942-NEXT: global_load_dwordx4 v[26:29], v1, s[2:3] offset:224
+; GFX942-NEXT: global_load_dwordx4 v[22:25], v1, s[2:3] offset:208
+; GFX942-NEXT: global_load_dwordx4 v[18:21], v1, s[2:3] offset:192
+; GFX942-NEXT: global_load_dwordx4 v[14:17], v1, s[2:3] offset:176
+; GFX942-NEXT: global_load_dwordx4 v[10:13], v1, s[2:3] offset:160
+; GFX942-NEXT: global_load_dwordx4 v[6:9], v1, s[2:3] offset:144
+; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[2:3] offset:128
+; GFX942-NEXT: global_load_dwordx4 a[0:3], v1, s[2:3] offset:112
+; GFX942-NEXT: global_load_dwordx4 v[58:61], v1, s[2:3] offset:96
+; GFX942-NEXT: global_load_dwordx4 v[54:57], v1, s[2:3] offset:80
+; GFX942-NEXT: global_load_dwordx4 v[50:53], v1, s[2:3] offset:64
+; GFX942-NEXT: global_load_dwordx4 v[46:49], v1, s[2:3] offset:48
+; GFX942-NEXT: global_load_dwordx4 v[42:45], v1, s[2:3] offset:32
+; GFX942-NEXT: global_load_dwordx4 v[38:41], v1, s[2:3] offset:16
+; GFX942-NEXT: global_load_dwordx4 v[34:37], v1, s[2:3]
; GFX942-NEXT: .LBB6_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[60:63], s[6:7] offset:112
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] offset:112
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[56:59], s[6:7] offset:96
+; GFX942-NEXT: global_store_dwordx4 v0, v[58:61], s[6:7] offset:96
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[52:55], s[6:7] offset:80
+; GFX942-NEXT: global_store_dwordx4 v0, v[54:57], s[6:7] offset:80
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[48:51], s[6:7] offset:64
+; GFX942-NEXT: global_store_dwordx4 v0, v[50:53], s[6:7] offset:64
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[44:47], s[6:7] offset:48
+; GFX942-NEXT: global_store_dwordx4 v0, v[46:49], s[6:7] offset:48
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[40:43], s[6:7] offset:32
+; GFX942-NEXT: global_store_dwordx4 v0, v[42:45], s[6:7] offset:32
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[36:39], s[6:7] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, v[38:41], s[6:7] offset:16
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[32:35], s[6:7]
-; GFX942-NEXT: global_store_dwordx4 v0, v[28:31], s[6:7] offset:240
-; GFX942-NEXT: global_store_dwordx4 v0, v[24:27], s[6:7] offset:224
-; GFX942-NEXT: global_store_dwordx4 v0, v[20:23], s[6:7] offset:208
-; GFX942-NEXT: global_store_dwordx4 v0, v[16:19], s[6:7] offset:192
-; GFX942-NEXT: global_store_dwordx4 v0, v[12:15], s[6:7] offset:176
-; GFX942-NEXT: global_store_dwordx4 v0, v[8:11], s[6:7] offset:160
-; GFX942-NEXT: global_store_dwordx4 v0, v[4:7], s[6:7] offset:144
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] offset:128
+; GFX942-NEXT: global_store_dwordx4 v0, v[34:37], s[6:7]
+; GFX942-NEXT: global_store_dwordx4 v0, v[30:33], s[6:7] offset:240
+; GFX942-NEXT: global_store_dwordx4 v0, v[26:29], s[6:7] offset:224
+; GFX942-NEXT: global_store_dwordx4 v0, v[22:25], s[6:7] offset:208
+; GFX942-NEXT: global_store_dwordx4 v0, v[18:21], s[6:7] offset:192
+; GFX942-NEXT: global_store_dwordx4 v0, v[14:17], s[6:7] offset:176
+; GFX942-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7] offset:160
+; GFX942-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] offset:144
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[6:7] offset:128
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -391,17 +391,17 @@ define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(
; GFX942-LABEL: v8i8_phi_chain:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX942-NEXT: v_and_b32_e32 v2, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v2
-; GFX942-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v2
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v2
+; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX942-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[8:9]
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[8:9]
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX942-NEXT: s_cbranch_execz .LBB8_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[10:11]
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v2
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[10:11]
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; GFX942-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX942-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
@@ -410,14 +410,14 @@ define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
; GFX942-NEXT: s_cbranch_execz .LBB8_4
; GFX942-NEXT: ; %bb.3: ; %bb.2
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[12:13]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[12:13]
; GFX942-NEXT: .LBB8_4: ; %bb.3
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[14:15]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[14:15]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -447,38 +447,38 @@ define amdgpu_kernel void @v8i8_phi_zeroinit(ptr addrspace(1) %src1, ptr addrspa
; GFX942-LABEL: v8i8_phi_zeroinit:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v5, 3, v4
-; GFX942-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v4
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4
+; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX942-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v5, s[8:9]
-; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[8:9]
+; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX942-NEXT: s_cbranch_execz .LBB9_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[2:3], v5, s[10:11]
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v4
+; GFX942-NEXT: global_load_dwordx2 v[4:5], v1, s[10:11]
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
; GFX942-NEXT: s_waitcnt vmcnt(1)
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; GFX942-NEXT: s_and_b64 s[4:5], vcc, exec
-; GFX942-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; GFX942-NEXT: .LBB9_2: ; %Flow
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
; GFX942-NEXT: s_cbranch_execz .LBB9_4
; GFX942-NEXT: ; %bb.3: ; %bb.2
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[12:13]
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[12:13]
; GFX942-NEXT: .LBB9_4: ; %bb.3
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[14:15]
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: global_store_dwordx2 v0, v[4:5], s[14:15]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -617,30 +617,30 @@ define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspac
; GFX942-LABEL: v8i8_multi_block:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX942-NEXT: v_and_b32_e32 v5, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v6, 3, v5
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v5
+; GFX942-NEXT: v_and_b32_e32 v3, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v3
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v3
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[2:3], v6, s[8:9]
+; GFX942-NEXT: global_load_dwordx2 v[0:1], v4, s[8:9]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[2:3]
+; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB11_4
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v6, s[10:11]
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v5
+; GFX942-NEXT: global_load_dwordx2 v[6:7], v4, s[10:11]
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v3
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX942-NEXT: s_cbranch_execz .LBB11_3
; GFX942-NEXT: ; %bb.2: ; %bb.2
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[12:13]
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[12:13]
; GFX942-NEXT: .LBB11_3: ; %Flow
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: .LBB11_4: ; %bb.3
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[14:15]
+; GFX942-NEXT: global_store_dwordx2 v2, v[6:7], s[14:15]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -859,15 +859,15 @@ define amdgpu_kernel void @v8i8_mfma_i8(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v4
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[8:9]
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[8:9]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB14_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[10:11]
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[10:11]
; GFX942-NEXT: .LBB14_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[14:15], 0x0
@@ -878,9 +878,9 @@ define amdgpu_kernel void @v8i8_mfma_i8(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[0:1], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 6
-; GFX942-NEXT: global_store_dwordx4 v2, a[0:3], s[12:13]
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[12:13]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -909,15 +909,15 @@ define amdgpu_kernel void @v8i8_mfma_half(ptr addrspace(1) %src1, ptr addrspace(
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx8 s[36:43], s[4:5], 0x24
; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v4
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[36:37]
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[36:37]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB15_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[38:39]
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[38:39]
; GFX942-NEXT: .LBB15_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_load_dwordx16 s[16:31], s[42:43], 0x0
@@ -957,18 +957,18 @@ define amdgpu_kernel void @v8i8_mfma_half(ptr addrspace(1) %src1, ptr addrspace(
; GFX942-NEXT: v_accvgpr_write_b32 a31, s15
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[0:1], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[2:3], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 2
-; GFX942-NEXT: global_store_dwordx4 v2, a[28:31], s[40:41] offset:112
-; GFX942-NEXT: global_store_dwordx4 v2, a[24:27], s[40:41] offset:96
-; GFX942-NEXT: global_store_dwordx4 v2, a[20:23], s[40:41] offset:80
-; GFX942-NEXT: global_store_dwordx4 v2, a[16:19], s[40:41] offset:64
-; GFX942-NEXT: global_store_dwordx4 v2, a[12:15], s[40:41] offset:48
-; GFX942-NEXT: global_store_dwordx4 v2, a[8:11], s[40:41] offset:32
-; GFX942-NEXT: global_store_dwordx4 v2, a[4:7], s[40:41] offset:16
-; GFX942-NEXT: global_store_dwordx4 v2, a[0:3], s[40:41]
+; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[40:41] offset:112
+; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[40:41] offset:96
+; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[40:41] offset:80
+; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[40:41] offset:64
+; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[40:41] offset:48
+; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[40:41] offset:32
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[40:41] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[40:41]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-regalloc-error.ll b/llvm/test/CodeGen/AMDGPU/wwm-regalloc-error.ll
index 145f1e4..ff18b32 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-regalloc-error.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-regalloc-error.ll
@@ -2,7 +2,7 @@
; A negative test to capture the expected error when the VGPRs are insufficient for wwm-regalloc.
-; CHECK: error: can't find enough VGPRs for wwm-regalloc
+; CHECK: error: cannot find enough VGPRs for wwm-regalloc
define amdgpu_kernel void @test(i32 %in) {
entry: